pkgs <- c("tidyverse", "tidylog", "devtools", "sdmTMB", "RCurl", "purrr", "janitor", "patchwork", "scales", "mapplots", "worrms", "fuzzyjoin", "rfishbase", "furrr", "broom", "mgcv", "tidygam", "terra", "rnaturalearth","rnaturalearthdata")
if(length(setdiff(pkgs, rownames(installed.packages()))) > 0){
install.packages(setdiff(pkgs, rownames(installed.packages())), dependencies = T)
}
invisible(lapply(pkgs, library, character.only = T))
# Source code for map plots
devtools::source_url("https://raw.githubusercontent.com/VThunell/Lammska_cod-fr/main/R/functions/map-plot.R")
options(ggplot2.continuous.colour = "viridis")
# Set path
home <- here::here()Prepare stomach content data v.2
Introduction
This script prepares stomach content data from three sources: 1) the ICES stomach content database with data from the Baltic Sea, 2) a copy the previous version of the ICES stomach content database and 3) recent Swedish observations not yet uploaded to the ICES database.
The output df contains one row per unique predator, their total and prey gruoup specific relative prey weight (rpw) along with necessary information used in analysis (coorinates, time etc.). To get there, in short, we summarise prey information per predator individual an combine the datasets. Furthermore, we add taxonomy (from the World register of marine species, Worms) and taxonomic rank specific length-weight relationships (using published data and [Fishbase])()), add weight predator or prey information when there is length available and then we do some additional cleaning (see below).
The stomach content database for the Baltic sea was downloaded twice. We use data for Lativa dowloaded at 16:21 Jan 15 2024 and for all other countries, year 1963-2021) was downloaded
Load libraries
1. Read data
Stomach content data i) current data from ICES stomach database (new data base, NDB) ii) Old database (ODB) data (since many observations are missing from the NDB) iii) Additional newer data (SE)
Additional information i) BITS haul information from DATRAS ii) length-weight coefficients from publication
# New database data, NDB are divided into four sheets
# New data from January 2024 for Latvia: StomachContent_0115213707
# New data (October 2024) that contains less stomachs for Other countries: StomachContent_1021080172
fi <- read_csv(paste0(home, "/data/stomach/StomachContent_0115213707/File_information.csv")) |> filter(Country == "LV") |>
bind_rows(read_csv(paste0(home, "/data/stomach/StomachContent_1021080172/File_information.csv")) |> filter(!Country == "LV"))Rows: 71 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Country, CruiseID
dbl (2): tblUploadID, Reporting_organisation
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
filter: removed 9 rows (13%), 62 rows remaining
Rows: 12 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Country, CruiseID
dbl (2): tblUploadID, Reporting_organisation
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
filter: no rows removed
# For the other three sheets for the NDB we use the tblUploadID for excluding and including LV.
hi <- read_csv(paste0(home, "/data/stomach/StomachContent_0115213707/HaulInformation.csv"),
col_types = list(
tblUploadID = col_double(),
tblHaulID = col_double(),
Ship = col_character(),
Gear = col_character(),
HaulNo = col_double(),
StationNumber = col_double(),
Year = col_double(),
Month = col_double(),
Day = col_double(),
Time = col_character(),
ShootLat = col_double(),
ShootLong = col_double(),
HaulLat = col_double(),
HaulLong = col_double(),
ICESrectangle = col_character(),
Depth = col_double(),
Survey = col_character(),
ICESDatabase = col_character(),
Notes = col_character())) |> # the parsing issues of hi causes no problems but specifying col_types corrects the datatypes
filter(tblUploadID %in% (fi |> filter(Country == "LV") |> pull(tblUploadID))) |>
bind_rows(read_csv(paste0(home, "/data/stomach/StomachContent_1021080172/HaulInformation.csv"),
col_types = list(
tblUploadID = col_double(),
tblHaulID = col_double(),
Ship = col_character(),
Gear = col_character(),
HaulNo = col_double(),
StationNumber = col_double(),
Year = col_double(),
Month = col_double(),
Day = col_double(),
Time = col_character(),
ShootLat = col_double(),
ShootLong = col_double(),
HaulLat = col_double(),
HaulLong = col_double(),
ICESrectangle = col_character(),
Depth = col_double(),
Survey = col_character(),
ICESDatabase = col_character(),
Notes = col_character())) |>
filter(tblUploadID %in% (fi |> filter(!Country == "LV") |> pull(tblUploadID)))) filter: removed 12 rows (16%), 62 rows remaining
filter: removed 203 rows (13%), 1,363 rows remaining
filter: removed 62 rows (84%), 12 rows remaining
filter: no rows removed
pred <- read_csv(paste0(home, "/data/stomach/StomachContent_0115213707/PredatorInformation_edit.csv")) |> # edited in text editor (not excel or equivalent) to remove three instances of erroneous '"' in column 'Notes' at row 7139,7144 and 7296.
filter(tblUploadID %in% (fi |> filter(Country == "LV") |> pull(tblUploadID))) |>
bind_rows(read_csv(paste0(home, "/data/stomach/StomachContent_1021080172/PredatorInformation.csv")) |> filter(tblUploadID %in% (fi |> filter(!Country == "LV") |> pull(tblUploadID))))Rows: 36980 Columns: 30
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (9): Ship, Gear, Time, Code, Sex, MaturityScale, PreservationMethod, Ge...
dbl (18): tblUploadID, tblHaulID, tblPredatorInformationID, HaulNo, StationN...
lgl (3): StomachFullness, FullStomWgt, EmptyStomWgt
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
filter: removed 12 rows (16%), 62 rows remaining
filter: removed 3,504 rows (9%), 33,476 rows remaining
Rows: 5801 Columns: 30
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (9): Ship, Gear, Time, Code, Sex, MaturityScale, PreservationMethod, Ge...
dbl (18): tblUploadID, tblHaulID, tblPredatorInformationID, HaulNo, StationN...
lgl (3): StomachFullness, FullStomWgt, EmptyStomWgt
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
filter: removed 62 rows (84%), 12 rows remaining
filter: no rows removed
prey <- read_csv(paste0(home, "/data/stomach/StomachContent_0115213707/PreyInformation.csv")) |>
filter(tblUploadID %in% (fi |> filter(Country == "LV") |> pull(tblUploadID))) |>
bind_rows(read_csv(paste0(home, "/data/stomach/StomachContent_1021080172/PreyInformation.csv")) |> filter(tblUploadID %in% (fi |> filter(!Country == "LV") |> pull(tblUploadID))))Rows: 50170 Columns: 30
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (9): Ship, Gear, Time, IdentMet, GravMethod, UnitWgt, UnitLngt, OtherIt...
dbl (21): tblUploadID, tblHaulID, tblPredatorInformationID, tblPreyInformati...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
filter: removed 12 rows (16%), 62 rows remaining
filter: removed 8,326 rows (17%), 41,844 rows remaining
Rows: 12391 Columns: 30
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (9): Ship, Gear, Time, IdentMet, GravMethod, UnitWgt, UnitLngt, OtherIt...
dbl (21): tblUploadID, tblHaulID, tblPredatorInformationID, tblPreyInformati...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
filter: removed 62 rows (84%), 12 rows remaining
filter: no rows removed
# Old database data (ODB)
ODB <- read_csv(paste0(home, "/data/stomach/StomachDataOld.csv")) # from Stefan via Nis via MaxRows: 574684 Columns: 52
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (16): Dataset, Country, Ship, Estimated_Lat_Lon, ICES_StatRec, Month, Da...
dbl (21): Latitude, Longitude, Year, Depth, Temperature, SampleNo(FishID), I...
lgl (15): RecordType, ICES_AreaCode, Predator_Age(mean), Predator_LowerLengt...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# New SE data
SE <- read_csv(paste0(home, "/data/stomach/full_stomach_data_x.csv")) # From MaxRows: 92868 Columns: 38
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (12): prey_latin_name, predator_code, pred_id, prey_number_type, prey_w...
dbl (25): year, month, pred_weight_g, pred_length_cm, gall_bladder, prey_we...
date (1): date
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# BITS survey data to improve coordinates in ODB
# bits_hh <- getDATRAS(record = "HH", survey = "BITS", years = 1963:2022, quarters = c(1,2,3,4))
# write_csv(bits_hh, paste0(home, "/data/DATRAS_exchange/bits_hh.csv"))
bits_hh <- read_csv(paste0(home, "/data/survey/DATRAS_exchange/bits_hh.csv"))Rows: 18105 Columns: 69
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (13): RecordType, Survey, Country, Ship, Gear, GearEx, StNo, DayNight, S...
dbl (48): Quarter, SweepLngt, HaulNo, Year, Month, Day, TimeShot, DepthStrat...
lgl (8): DoorType, Rigging, Tickler, SecchiDepth, Turbidity, TidePhase, Tid...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# North sea length-weight coefficients
# a and bs for benthic species in the North sea: https://doi.org/10.1017/S0025315409991408
NSab <- read_csv(paste0(home, "/data/stomach/length-weight-Robinson_2010.csv"))New names:
Rows: 216 Columns: 13
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(5): Species, Phylum, Class, Order, Length dbl (7): Min L (mm), Max L (mm), a,
b, r2, P, N lgl (1): ...13
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...13`
2. Prey taxonomic summary and length-weight coefficients for all cod stomach prey
From all unique prey AphiaIDs and Scientific names in the different databases we create a full taxonomic list of Baltic cod prey species using the r package worrms. We then use this to add length weight parameters where possible (a & b from literature, fishbase or estimated), we can then join in accepted Scientific names or AphiaIDs to all prey species in the dbs.
Some AphiaIDs turns out to be wrong though…
# Where we have both AphiaID and Scientificname, we test whether these match in worms using function to get name from aphia ID. When they dont, we that the AID is wrong because the AID should come after the name in the laboratory identification-enter-in-db-process and therefore less likley to be wrong.
# Function for getting AphiaIDs from Scientific names
# getNamefAid <- function(Prey_LatinName, Prey_AphiaID, ...) {
#
# ScientificName = tryCatch(wm_id2name(id = Prey_AphiaID), error = function(e){
# message("Scientific name missing, returning NA:\n", e)
# NA
# }
# )
# return(tibble(ScientificName, Prey_LatinName, Prey_AphiaID)) # preserve ScientificNames for matching in db later
# }
#
# ODBcheck <- ODB |> filter(Predator_LatinName == "Gadus morhua",
# Longitude > 13) |>
# dplyr::select(Prey_LatinName, Prey_AphiaID) |>
# distinct() |>
# pmap(possibly(getNamefAid, otherwise = NULL)) |>
# list_rbind()
#
# ODBcheck |>
# filter(Prey_LatinName != ScientificName)
# For these, we skip the AID and go for Names instead. So if both Prey_LatinName and Prey_AphaID is present, we use name for getting records and LW as and bs.
# An alternative is to get names for all AID:s in the ODB (more than 300 000) and then remove the data where names doesn't match. But, many AphiaIDs are at the rank of e.g. genus och family of a species and is still sort of correct.Get taxonomy
# # pick columns with species related info all datasets
# str(prey) # only AphiaIDPrey
# str(ODB) # Prey_AphiaID and Prey_LatinName
# str(SE) # prey_latin_name
#
# # Remove AphiaIDs in the ODB where both scientific name and AID exists (see above). By doing this, we remove instances where the db_ScientificName is way off the correct AID.
# ODB_b <- ODB|>
# mutate(Prey_AphiaID = if_else(!(is.na(Prey_LatinName) & is.na(Prey_AphiaID)), NA, Prey_AphiaID))
#
# # Collect all AphiaIDs and latin names and clean names
# AIDLnames <- prey |>
# filter(AphiaIDPredator == 126436) |>
# dplyr::select(AphiaIDPrey) |>
# mutate(db_ScientificName = NA) |>
# rename(db_AphiaID = AphiaIDPrey) |>
# bind_rows(SE |>
# filter(predator_code == "COD") |>
# dplyr::select(prey_latin_name) |>
# rename(db_ScientificName = prey_latin_name) |>
# mutate(db_AphiaID = NA)) |>
# bind_rows(ODB_b |>
# filter(Predator_LatinName == "Gadus morhua") |>
# dplyr::select(Prey_LatinName, Prey_AphiaID) |>
# rename(db_ScientificName = Prey_LatinName,
# db_AphiaID = Prey_AphiaID)) |>
# distinct() |>
# filter(!(is.na(db_ScientificName) & is.na(db_AphiaID)))
#
# # We also use Stomach_Item in the ODB where there is prey length or weight but latin name is missing. Add those names to the list
# AIDLnames <- AIDLnames |>
# bind_rows(ODB_b |> filter(is.na(Prey_LatinName), Prey_Length > 0 | !is.na(Prey_Length | Prey_Weight > 0)) |> distinct(Stomach_Item) |> rename(db_ScientificName = Stomach_Item))
#
# AIDLnamesClean <- AIDLnames |>
# mutate(db_CleanScientificName = make_clean_namess(db_ScientificName, allow_dupes = TRUE), # some names cleaning her and below
# db_CleanScientificName = str_to_title(db_CleanScientificName),
# db_CleanScientificName = str_remove_all(db_CleanScientificName,"//."),
# db_CleanScientificName = str_remove(db_CleanScientificName,"2"),
# db_CleanScientificName = str_remove_all(db_CleanScientificName," "),
# db_CleanScientificName = str_remove_all(db_CleanScientificName," "),
# db_CleanScientificName = str_replace_all(db_CleanScientificName,"_", " "),
# db_CleanScientificName = if_else(str_ends(db_CleanScientificName, "sp"), str_remove(db_CleanScientificName,"sp"), db_CleanScientificName),
# db_CleanScientificName = str_replace_all(db_CleanScientificName,"_", " ")) |>
# mutate(db_CleanScientificName = if_else(db_CleanScientificName == "Na", NA, db_CleanScientificName))
#
# # Function for getting AphiaIDs from Scientific names
# getAidfName <- function(db_ScientificName, db_CleanScientificName, ...) {
#
# db_AphiaID = tryCatch(wm_name2id(name = db_CleanScientificName), error = function(e){
# message("Scientific name missing, returning NA:\n", e)
# NA
# }
# )
# return(tibble(db_AphiaID, db_ScientificName, db_CleanScientificName)) # preserve db_ScientificNames for matching in db later
# }
#
# # apply function to ScientificName as AID:s seems to produce more matching records than names. Noe that many generates error: partial content. We need to fix those manually later.
# AIDLnamesClean2 <- AIDLnamesClean |>
# filter(is.na(db_AphiaID)) |>
# pmap(getAidfName) |>
# list_rbind()
#
# AIDLnamesClean3 <- AIDLnamesClean |>
# filter(!is.na(db_AphiaID)) |>
# bind_rows(AIDLnamesClean2)
#
# AIDLnamesClean3 <- AIDLnamesClean3 |>
# distinct() # 51 rows were duplicates, i.e. AID matched with names.
# # now we have a df with all AphiaIDs and Scientific names found in the databases with matching records. From this we can add to the respective database the valid AphiaID for each prey.
# # get records for the prey species list based on AphiaIDs (but remove NAs)
# getRecordsfAid <- function(db_AphiaID, db_ScientificName, db_CleanScientificName, ...) {
# if(is.numeric(db_AphiaID)) { # if db_AphiaID is NA, this is false
# wm_record(id = db_AphiaID) |> tibble(db_AphiaID, db_ScientificName, db_CleanScientificName) }
# else {
# wm_records_common(name = db_CleanScientificName, fuzzy = TRUE) |> tibble(db_AphiaID, db_ScientificName, db_CleanScientificName) }
# }
#
# # get records for the prey species list based on AphiaIDs (but remove NAs)
# preySpecies_records <- AIDLnamesClean3 |>
# pmap(possibly(getRecordsfAid, otherwise = NULL)) |>
# list_rbind()
#
# # preySpecies_records lacks records of 48 db_ScientificNames (where db_AphiaID == NA) so lets try and fix the missing records with a fuzzy join of latin names
# preySpecies_records2 <- AIDLnamesClean3 |>
# filter(is.na(db_AphiaID),
# !is.na(db_CleanScientificName)) |> # one row with all columns NA
# stringdist_left_join(preySpecies_records |> filter(!(is.na(AphiaID) | is.na(valid_name))) |>
# dplyr::select(-db_AphiaID, -db_ScientificName, -db_CleanScientificName),
# by = c(db_CleanScientificName = "valid_name")) |>
# distinct()
#
# # 48 records generated 53 which is good and we can add the non-NAs record to the list but we need to keep the db_Scientificnames so that these can be used to match records later. Theres a few dupes to remove
# preySpecies_records2 |>
# rownames_to_column() |>
# group_by(db_ScientificName) %>%
# filter(n()>1)
# # and there are a few fuzzy joins that didnt end well, Scales are not Fucales, Mucus is not Fucus and Munida is not Mysida.
# preySpecies_records2 |>
# filter(db_ScientificName != scientificname) |>
# dplyr::select(db_ScientificName, scientificname)
#
# # after checking we can remove [c(1,13,29,34,53), ] and then fuzzy-errors and then the missing record rows
# preySpecies_records3 <- preySpecies_records2 |>
# slice(-c(1,13,29,34,53)) |>
# filter(!db_ScientificName %in% c("Scales", "scales", "mucus", "Munida"),
# !is.na(AphiaID))
# # reset "Scales", "scales", "mucus", "Munida" AphiaID to NA
# preySpecies_records2 <- preySpecies_records2 |>
# mutate(AphiaID = if_else(db_ScientificName %in% c("Scales", "scales", "mucus", "Munida"), NA, AphiaID))
#
# # for the remaining non-id:d and, we correct the organic animal prey (reomev plants and trash) manually and the others are NA.
# preySpecies_records2 |> slice(-c(1,13,29,34,53)) |> filter(is.na(AphiaID)) |> pull(db_ScientificName)
#
# preySpecies_records4 <- preySpecies_records2 |> slice(-c(1,13,29,34,53)) |> filter(is.na(db_AphiaID)) |>
# mutate(db_AphiaID = case_when(db_ScientificName %in% c("Pisces/Osteichthyes","Scales","scales", "Unidentified fish","Fish eggs","Pomatoschistus otholyth","Salmon stomach") ~ 152352,
# db_ScientificName %in% c("Stone","Waste","Wood","Algae","stone","wood", "Aglae", "plastic", "carbon","Carbon","Plastic","Plastics","Litter/plastics", "", "Sand","Unidentified algae covered wit","Unidentified mass","Undefined mass") ~ NA,
# db_ScientificName %in% c("remains","mucus","Remains","digestive tract","Unidentified remains","Spawn","Siphon","Entrails","Chicken bone","Zooplancton","Unidentified invertebrata","Unidentified worm") ~ 2,
# db_ScientificName %in% c("Mytilus sp.","Mytilus sp") ~ 138228,
# db_ScientificName == "Idotea sp." ~ 118454,
# db_ScientificName == "Phaeophycophyta" ~ 830,
# db_ScientificName == "Crangon" ~ 107007,
# db_ScientificName == "Pectinaria sp." ~ 129437,
# db_ScientificName == "Gobius niger" ~ 126892,
# db_ScientificName == "Pleuronectoidei" ~ 125579,
# db_ScientificName == "Corystes cassivelanus" ~ 107277,
# db_ScientificName == "Sipunculoidea" ~ 1648,
# db_ScientificName == "Cribrinidae" ~ 267346,
# db_ScientificName == "Spirontocaris securifrons" ~ 107531,
# db_ScientificName == "Spirontocaris affinis" ~ 106994,
# db_ScientificName == "Ciliata mustella" ~ 126448,
# db_ScientificName == "Ophiothricidae" ~ 123208,
# db_ScientificName == "Leptocardia" ~ 104897,
# db_ScientificName == "Brachyrhyncha" ~ 736748,
# db_ScientificName == "Astarte procera" ~ 1735368,
# db_ScientificName == "Paleonemertea" ~ 122307,
# db_ScientificName == "Gadoidei" ~ 125469,
# db_ScientificName %in% c("Clupeoidei","Clupeidae scales") ~ 125464,
# db_ScientificName == "Gammarus sp." ~ 101537,
# db_ScientificName == "Metridiidae" ~ 100678,
# db_ScientificName == "Lampetra" ~ 101167,
# db_ScientificName == "Hyperia" ~ 101796,
# db_ScientificName == "Dromiacea" ~ 106678,
# db_ScientificName == "Lucifer" ~ 106827,
# db_ScientificName %in% c("Galathea sp.","Galathea") ~ 106834,
# db_ScientificName == "Munida" ~ 106835,
# db_ScientificName == "Porcellana" ~ 106838,
# db_ScientificName == "Nephrops" ~ 106863 ,
# db_ScientificName == "Atelecyclus rotundatus" ~ 107273,
# db_ScientificName == "Asterias" ~ 123219,
# db_ScientificName == "Cucumaria" ~ 123479,
# db_ScientificName == "Gadus" ~ 125732,
# db_ScientificName == "Ciliata" ~ 125741,
# db_ScientificName %in% c("Trachurus","Horse mackerel") ~ 125946,
# db_ScientificName == "Trachinus" ~ 126094,
# db_ScientificName == "Owenia" ~ 129427,
# db_ScientificName == "Solea" ~ 126132,
# db_ScientificName == "Pectinaria" ~ 129437,
# db_ScientificName == "Synchaeta" ~ 134958,
# db_ScientificName == "Turritella" ~ 138615,
# db_ScientificName == "Polydora" ~ 129619 ,
# db_ScientificName == "Macropipus dupurator" ~ 156228,
# db_ScientificName == "Gnathostomata" ~ 1828,
# db_ScientificName == "Cardium edule" ~ 1600680,
# db_ScientificName == "Pontophilus gracilis" ~ 246192,
# db_ScientificName == "Pectinaria californiensis" ~ 337505,
# db_ScientificName == "Nereis diversicolor" ~ 152302,
# db_ScientificName == "Chlamys" ~ 138315,
# db_ScientificName == "Asterias rubens" ~ 123776,
# db_ScientificName == "Littorina planaxis" ~ 445651,
# db_ScientificName == "Liparis" ~ 126160,
# db_ScientificName == "Ophiothrix" ~ 123626,
# db_ScientificName == "Anguilla vulgaris" ~ 126281,
# db_ScientificName == "Phyllodoce maculata" ~ 334510,
# db_ScientificName == "Palaemon sp." ~ 107032,
# db_ScientificName == "Pectinaria sp." ~ 129437,
# db_ScientificName == "Astropecten duplicatus" ~ 178650,
# db_ScientificName == "Amphitrite" ~ 129686,
# db_ScientificName == "ASTARTE" ~ 137683,
# db_ScientificName == "Pecten raveneli" ~ 394070,
# db_ScientificName == "Phyllodoce" ~ 129455,
# db_ScientificName == "Terebellidae" ~ 982,
# db_ScientificName == "Herring" ~ 126417,
# db_ScientificName %in% c("Sprat","Sprat eggs","sprat") ~ 126425,
# db_ScientificName %in% c("Cod","Cod Eggs","Cod stomach","Filet of Cod") ~ 126436,
# db_ScientificName == "Four-bearded rockling" ~ 126450,
# db_ScientificName == "Flounder" ~ 125579,
# db_ScientificName == "Whiting" ~ 126438,
# db_ScientificName == "Stickleback" ~ 1617137,
# db_ScientificName == "Nine-spined stickleback" ~ 126507,
# db_ScientificName %in% c("Liocarcinus sp.","liocarcinus sp.") ~ 106925,
# db_ScientificName == "Pagurus sp." ~ 106854,
# db_ScientificName == "Fifteen-spined stickleback" ~ 125777,
# db_ScientificName == "Pasiphaea sp." ~ 107052,
# db_ScientificName == "Zoarces viviparus eggs" ~ 127123,
# db_ScientificName == "Myoxocephalus quadricornis egg" ~ 254529,
# db_ScientificName == "Round goby" ~ 126916,
# db_ScientificName == "Enchelyopus eggs" ~ 125742,
# db_ScientificName == "Lycodes sp." ~ 126104,
# db_ScientificName == "Corophium sp." ~ 101489,
# db_ScientificName == "Cerastoderma sp." ~ 137735,
# db_ScientificName == "Broad-nosed pipefish" ~ 127393,
# db_ScientificName == "Saduria entomon eggs" ~ 119034,
# db_ScientificName == "Macoma sp." ~ 138531,
# db_ScientificName == "Taurulus bubalis eggs" ~ 127204,
# .default = NA),
# db_AphiaID = as.integer(db_AphiaID))
#
# # get records for the fixed AIDs, reuse preySpecies as name for the df to use getRecordsAid() function
# preySpecies_records5 <- preySpecies_records4 |>
# dplyr::select(db_AphiaID, db_ScientificName, db_CleanScientificName) |>
# pmap(possibly(getRecordsfAid, otherwise = NULL)) |>
# list_rbind()
#
# leftover_prey <- preySpecies_records4 |> filter(is.na(db_AphiaID) & is.na(AphiaID))
# # bind preySpeciesrecords with preySpeciesrecords3 and preySpeciesrecords5
#
# preySpecies_records6 <- preySpecies_records |>
# bind_rows(preySpecies_records3, preySpecies_records5, leftover_prey)
#
# preySpecies_records6 |>
# filter(db_CleanScientificName != scientificname) |>
# group_by(db_ScientificName) |>
# filter(n()>1)
#
# # remove the duplicates and give it a final name
# preyTaxamatch <- preySpecies_records6 |>
# filter(!(is.na(db_AphiaID) & db_ScientificName %in% c("Gammarus sp.", "Palaemon sp.")))
#
# saveRDS(preyTaxamatch, paste0(home, "/data/stomach/preyTaxamatch.rds"))
preyTaxamatch <- readRDS(paste0(home, "/data/stomach/preyTaxamatch.rds"))Length-weight parameters (a and b) to prey summary
length-weight relationships retrived from and Fishbase using rfishbase.
# # Using NSab where coefficients are for length in mm while fishbase is for cm. We´ll correct for this below
# preyTaxamatch_Invert <- preyTaxamatch |>
# filter(!phylum == "Chordata",
# !is.na(scientificname)
# ) |>
# stringdist_left_join(NSab |> dplyr::select("Species", "a", "b"), by = c(scientificname = "Species")) |>
# mutate(abTaxlevel = if_else(is.na(a), NA, "species"))
#
# # get mean a and b per order
# NSab_order <- NSab |> summarise(a = mean(a), b = mean(b),
# .by = c(Order))
#
# preyTaxamatch_Invert2 <- preyTaxamatch_Invert |>
# filter(is.na(a)) |> dplyr::select(-a,-b) |>
# left_join(NSab_order |> dplyr::select("Order", "a", "b"), join_by("order"=="Order")) |>
# mutate(abTaxlevel = if_else(is.na(a), NA, "order")) |>
# bind_rows(preyTaxamatch_Invert |> filter(!is.na(a)))
#
# # still more than half of a and bs missing...
# preyTaxamatch_Invert2 |> filter(is.na(a))
#
# # use class instead of order
# NSab_class <- NSab |> summarise(a = mean(a), b = mean(b),
# .by = c(Class))
#
# preyTaxamatch_Invert3 <- preyTaxamatch_Invert2 |>
# filter(is.na(a)) |> dplyr::select(-a,-b) |>
# left_join(NSab_class |> dplyr::select("Class", "a", "b"), join_by("class"=="Class")) |>
# mutate(abTaxlevel = if_else(is.na(a), NA, "class")) |>
# bind_rows(preyTaxamatch_Invert2 |> filter(!is.na(a)))
#
# # still lots (122) missing, but many classes are maybe not length-weight realtionship applicable, such as malcostraca, bivalvia and some other molluscs?
# preyTaxamatch_Invert3 |> filter(is.na(a))
#
# # use phylum instead of class
# NSab_phylum <- NSab |> summarise(a = mean(a), b = mean(b),
# .by = c(Phylum))
#
# preyTaxamatch_Invert4 <- preyTaxamatch_Invert3 |>
# filter(is.na(a)) |> dplyr::select(-a,-b) |>
# left_join(NSab_phylum |> dplyr::select("Phylum", "a", "b"), join_by("phylum"=="Phylum")) |>
# mutate(abTaxlevel = if_else(is.na(a), NA, "phylum")) |>
# bind_rows(preyTaxamatch_Invert3 |> filter(!is.na(a)))
#
# # still some missing
# preyTaxamatch_Invert4 |> filter(is.na(a))
# # Ok, thats enough, ever tried to messure a Priapulid?
#
# # The NSabs are fitted for animals meassured in mm which they are not for fishbase (FSab),
# preyTaxamatch_Invert4 <- preyTaxamatch_Invert4 |>
# mutate(a = a*(1/10)^b) # rescale since a and b are estimated in mm following the rule (ab)^c = a^c*b^c a so that we can break out 1/10 (mm to cm) and add it to the a scalar
#
# # Now Chordates (i.e. fish) by using package rfishbase
# #length_weight("Gadus morhua")
# #as the rfishbase function length_weight returns multimple observations, we have to make a function # that both gets and summarises those
# getabsChord <- function(nam, ind) {
#
# lwab = length_weight(nam) |>
# filter(Type == "TL") |> # filter out total length measurements
# summarise(a = mean(a), b = mean(b))
#
# bind_cols(preyTaxamatch_Chord[ind,], lwab)
# }
#
# # then we can map that function with a furrr::future to improve speed as length_weight takes a while to run
# preyTaxamatch_Chord <- preyTaxamatch |> filter(phylum == "Chordata")
#
# plan(multisession, workers = 3) # saves 40% time in a small test
# preyTaxamatch_Chord <- preyTaxamatch_Chord |>
# pull(valid_name) |>
# future_imap(\(x, idx) getabsChord(x, idx), .options = furrr_options(seed = T)) |>
# list_rbind()
# plan(sequential)
#
# preyTaxamatch_Chord <- preyTaxamatch_Chord |>
# mutate(abTaxlevel = if_else(is.na(a), NA, "species"))
#
# # lots of as and bs missing
# preyTaxamatch_Chord |> filter(is.na(a))
#
# # use family instead of genus
# FBab_family <- preyTaxamatch_Chord |>
# filter(!is.na(a)) |>
# summarise(a = mean(a), b = mean(b), .by = c(family))
#
# preyTaxamatch_Chord2 <- preyTaxamatch_Chord |>
# filter(is.na(a)) |>
# dplyr::select(-a,-b) |>
# left_join(FBab_family |> dplyr::select("family", "a", "b")) |>
# mutate(abTaxlevel = if_else(is.na(a), NA, "family")) |>
# bind_rows(preyTaxamatch_Chord |> filter(!is.na(a)))
#
# # left over fish and tunicates
# preyTaxamatch_Chord2 |> filter(is.na(a))
#
# # lets use mean a and b for the chordates for these
# cho_meanab <- preyTaxamatch_Chord2 |>
# summarise(mean_a = mean(a, na.rm=TRUE), mean_b = mean(b, na.rm=TRUE))
#
# preyTaxamatch_Chord2 <- preyTaxamatch_Chord2 |>
# mutate(a = if_else(is.na(a), cho_meanab$mean_a, a),
# b = if_else(is.na(b), cho_meanab$mean_b, b))
#
# # Join in join in invertebrates
# preyTaxamatch_LW <- preyTaxamatch_Chord2 |>
# bind_rows(preyTaxamatch_Invert4 |> dplyr::select(-Species))
#
# preyTaxamatch_LW |>
# summarise(meana = mean(a, na.rm = TRUE), by = abTaxlevel)
#
# saveRDS(preyTaxamatch_LW, paste0(home, "/data/stomach/preyTaxamatch_LW.rds"))
preyTaxamatch_LW <- readRDS(paste0(home, "/data/stomach/preyTaxamatch_LW.rds"))
preyTaxamatch_LW |>
filter(is.na(a))filter: removed 568 rows (94%), 36 rows remaining
# A tibble: 36 × 33
AphiaID url scientificname authority status unacceptreason taxonRankID
<int> <chr> <chr> <chr> <chr> <chr> <int>
1 101156 https://w… Halicryptus s… von Sieb… accep… <NA> 220
2 101160 https://w… Priapulus cau… Lamarck,… accep… <NA> 220
3 144129 https://w… Fucus Linnaeus… accep… <NA> 180
4 101063 https://w… Priapulida Théel, 1… accep… <NA> 30
5 799 https://w… Nematoda <NA> accep… <NA> 30
6 1248 https://w… Ctenophora Eschscho… accep… <NA> 30
7 101156 https://w… Halicryptus s… von Sieb… accep… <NA> 220
8 101093 https://w… Halicryptus von Sieb… accep… <NA> 180
9 101160 https://w… Priapulus cau… Lamarck,… accep… <NA> 220
10 101156 https://w… Halicryptus s… von Sieb… accep… <NA> 220
# ℹ 26 more rows
# ℹ 26 more variables: rank <chr>, valid_AphiaID <int>, valid_name <chr>,
# valid_authority <chr>, parentNameUsageID <int>, kingdom <chr>,
# phylum <chr>, class <chr>, order <chr>, family <chr>, genus <chr>,
# citation <chr>, lsid <chr>, isMarine <int>, isBrackish <int>,
# isFreshwater <int>, isTerrestrial <int>, isExtinct <int>, match_type <chr>,
# modified <chr>, db_AphiaID <dbl>, db_ScientificName <chr>, …
preyTaxamatch_LW |>
filter(!is.na(a)) |>
pivot_longer(cols = c("a", "b"), values_to = "value", names_to = "params") |>
ggplot() +
geom_histogram(aes(value, fill = abTaxlevel)) + #, bins = 100) +
facet_grid(params~abTaxlevel) +
guides(fill="none")filter: removed 36 rows (6%), 568 rows remaining
pivot_longer: reorganized (a, b) into (params, value) [was 568x33, now 1136x33]
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.
# two groupings of a and bs in the histograms above are due to phylum I think, fish have small a and invertebrates have a:s around 23
preyTaxamatch_LW |>
filter(phylum %in% c("Chordata", "Arthropoda")) |>
summarise(mean_a = mean(a, na.rm = TRUE), mean_b = mean(b, na.rm = TRUE), .by = phylum)filter: removed 195 rows (32%), 409 rows remaining
summarise: now 2 rows and 3 columns, ungrouped
# A tibble: 2 × 3
phylum mean_a mean_b
<chr> <dbl> <dbl>
1 Chordata 0.00947 3.03
2 Arthropoda 0.0541 2.71
3. New database data
names(fi)[1] "tblUploadID" "Country" "Reporting_organisation"
[4] "CruiseID"
names(hi) [1] "tblUploadID" "tblHaulID" "Ship" "Gear"
[5] "HaulNo" "StationNumber" "Year" "Month"
[9] "Day" "Time" "ShootLat" "ShootLong"
[13] "HaulLat" "HaulLong" "ICESrectangle" "Depth"
[17] "Survey" "ICESDatabase" "Notes"
names(pred) [1] "tblUploadID" "tblHaulID"
[3] "tblPredatorInformationID" "Ship"
[5] "Gear" "HaulNo"
[7] "StationNumber" "Year"
[9] "Month" "Day"
[11] "Time" "FishID"
[13] "AphiaIDPredator" "IndWgt"
[15] "Number" "MeasurementIncrement"
[17] "Length" "Code"
[19] "Age" "Sex"
[21] "MaturityScale" "MaturityStage"
[23] "PreservationMethod" "Regurgitated"
[25] "StomachFullness" "FullStomWgt"
[27] "EmptyStomWgt" "StomachEmpty"
[29] "GenSamp" "Notes"
names(prey) [1] "tblUploadID" "tblHaulID"
[3] "tblPredatorInformationID" "tblPreyInformationID"
[5] "Ship" "Gear"
[7] "HaulNo" "StationNumber"
[9] "Year" "Month"
[11] "Day" "Time"
[13] "FishID" "AphiaIDPredator"
[15] "AphiaIDPrey" "IdentMet"
[17] "DigestionStage" "GravMethod"
[19] "SubFactor" "PreySequence"
[21] "Count" "UnitWgt"
[23] "Weight" "UnitLngt"
[25] "Length" "OtherItems"
[27] "OtherCount" "OtherWgt"
[29] "AnalysingOrg" "Notes"
Join all data files (new db) in a specific order: fi -> hi -> pred -> prey.
For some joins, there are multiple column names shared in addition to the key that I could remove and keep only the ID key and the non-shared columns but instead I keep them. First I need to ensure they are the same (VT what is the same and how do you do this?), and not only have the same name. Will also check if both datasets have the same amount of NA before choosing which column to carry from which dataset.
hifi <- left_join(hi, fi, by = "tblUploadID") # join haul data with file infoleft_join: added 3 columns (Country, Reporting_organisation, CruiseID)
> rows only in hi 0
> rows only in fi ( 0)
> matched rows 1,675
> =======
> rows total 1,675
comcol_hifi_pred <- intersect(colnames(pred), colnames(hifi))
# Check if any of the two datasets have NA in the common columns which can screw up joining dfs
unique(is.na(hifi |> dplyr::select(all_of(comcol_hifi_pred)))) tblUploadID tblHaulID Ship Gear HaulNo StationNumber Year Month Day
[1,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[2,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
Time Notes
[1,] FALSE TRUE
[2,] FALSE FALSE
unique(is.na(pred |> dplyr::select(all_of(comcol_hifi_pred)))) tblUploadID tblHaulID Ship Gear HaulNo StationNumber Year Month Day
[1,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[2,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
Time Notes
[1,] FALSE TRUE
[2,] FALSE FALSE
# The column Notes does have NAs and different meanings in hi and fi so we will remove Notes before joining
pred <- left_join(pred |> dplyr::select(-Notes), # join in predator data
hifi |> dplyr::select(-Notes),
by = comcol_hifi_pred[!comcol_hifi_pred == "Notes"])left_join: added 11 columns (ShootLat, ShootLong, HaulLat, HaulLong, ICESrectangle, …)
> rows only in dplyr::select(pred, -No.. 0
> rows only in dplyr::select(hifi, -No.. ( 0)
> matched rows 39,277
> ========
> rows total 39,277
Fix pred regurgitated stomachs
# In the Swedish data, 2 means regurgitated and 1 is intact, but for the rest, 1 means regurgitated, 0 or NA means intact. All NA values are from 2020 and 2021 and for all regurguitated=0 before 2005 (see Neuenfeldt 2020 discussion, https://doi.org/10.1093/icesjms/fsz224). So we dont know if they are truly not regurgitated. Therefore there is not much info of value from the regurgitated column.
pred |>
filter(Country == "SE") |>
mutate(regurg_f = as.factor(Regurgitated)) |>
summarise(regy = n(), .by = c(Year, regurg_f))filter: removed 38,653 rows (98%), 624 rows remaining
mutate: new variable 'regurg_f' (factor) with 3 unique values and 8% NA
summarise: now 3 rows and 3 columns, ungrouped
# A tibble: 3 × 3
Year regurg_f regy
<dbl> <fct> <int>
1 2021 1 572
2 2021 <NA> 48
3 2021 2 4
pred <- pred |>
mutate(Regurgitated_st = Regurgitated,
Regurgitated_st = if_else(Country == "SE" & Regurgitated == 1, 0, Regurgitated_st),
Regurgitated_st = if_else(Country == "SE" & Regurgitated == 2, 1, Regurgitated_st),
Regurgitated_st = replace_na(Regurgitated_st, 0))mutate: new variable 'Regurgitated_st' (double) with 2 unique values and 0% NA
# Regurgitated stomachs are only about 2% and come from 2021, 2018 and 2013 (and 1 in 1981).
pred |>
mutate(Haul_ID = paste(Country, Year, Month, Day, HaulNo, ICESrectangle)) |>
summarise(count = n(), .by = c(Regurgitated_st, Year)) |>
filter(Regurgitated_st == 1)mutate: new variable 'Haul_ID' (character) with 1,674 unique values and 0% NA
summarise: now 55 rows and 3 columns, ungrouped
filter: removed 48 rows (87%), 7 rows remaining
# A tibble: 7 × 3
Regurgitated_st Year count
<dbl> <dbl> <int>
1 1 1981 1
2 1 2013 12
3 1 2021 368
4 1 2014 129
5 1 2017 53
6 1 2018 195
7 1 2016 145
# There are regurgitated stomachs (column in pred) that have information in prey info, i.e. either its incorrect or they have signs of regurgitation but prey in the stomach. These are various prey types from 2018 to 2021. Lets remove these for the rpw analyses.
regurg_ids <- pred |>
filter(Regurgitated_st == 1) filter: removed 38,374 rows (98%), 903 rows remaining
prey |> filter(tblPredatorInformationID %in% regurg_ids$tblPredatorInformationID) |> distinct(Weight, AphiaIDPrey, Year) |> as.data.frame()filter: removed 53,890 rows (99%), 345 rows remaining
distinct: removed 338 rows (98%), 7 rows remaining
Weight AphiaIDPrey Year
1 NA NA 2013
2 NA NA 2021
3 0.07 125464 2021
4 NA NA 2014
5 0.17 293511 2014
6 NA NA 2017
7 NA NA 2016
# Remove regurgitated stomachs from pred and prey to reduce issues with the relative prey weight becoming incorrect.
pred2 <- pred |>
filter(!tblPredatorInformationID %in% regurg_ids$tblPredatorInformationID)filter: removed 903 rows (2%), 38,374 rows remaining
prey2 <- prey |>
filter(!tblPredatorInformationID %in% regurg_ids$tblPredatorInformationID)filter: removed 345 rows (1%), 53,890 rows remaining
Join in prey data
Now join predator data to prey data following the same procedure.
intersect(colnames(pred), colnames(prey)) [1] "tblUploadID" "tblHaulID"
[3] "tblPredatorInformationID" "Ship"
[5] "Gear" "HaulNo"
[7] "StationNumber" "Year"
[9] "Month" "Day"
[11] "Time" "FishID"
[13] "AphiaIDPredator" "Length"
# Length is a common column, but it corresponds to predator or prey. Rename!
pred3 <- pred2 |> rename(pred_length = Length,
pred_weight = IndWgt)rename: renamed 2 variables (pred_weight, pred_length)
prey3 <- prey2 |> rename(prey_length = Length,
prey_weight = Weight)rename: renamed 2 variables (prey_weight, prey_length)
comcol_prey_pred <- intersect(colnames(pred3), colnames(prey3))
# Check if any of the two datasets have NA in the common columns
unique(is.na(pred3 |> dplyr::select(all_of(comcol_prey_pred)))) tblUploadID tblHaulID tblPredatorInformationID Ship Gear HaulNo
[1,] FALSE FALSE FALSE FALSE FALSE FALSE
StationNumber Year Month Day Time FishID AphiaIDPredator
[1,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
unique(is.na(prey3 |> dplyr::select(all_of(comcol_prey_pred)))) tblUploadID tblHaulID tblPredatorInformationID Ship Gear HaulNo
[1,] FALSE FALSE FALSE FALSE FALSE FALSE
StationNumber Year Month Day Time FishID AphiaIDPredator
[1,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Rename "Notes" in prey data to avoid confusion as to which dataset it belongs to
prey3 <- prey3 |> rename(prey_notes = Notes)rename: renamed one variable (prey_notes)
# Join in pred3 info into prey. There are rows only in pred3 (y) that are not in prey. These are either empty, regurgitated or incorrect. Empties will be added later. We go from 50 000 prey to 62 000 obsevervations
new_db <- left_join(pred3, prey3, by = c(comcol_prey_pred))left_join: added 17 columns (tblPreyInformationID, AphiaIDPrey, IdentMet, DigestionStage, GravMethod, …)
> rows only in pred3 12,026
> rows only in prey3 ( 0)
> matched rows 53,890 (includes duplicates)
> ========
> rows total 65,916
Fix prey weights etc
Calculate total weight of specific prey species by unique predator ID. For NA and zero weights, we estimate weight if length is present. If length is not NA and Weight is 0 or NA, estimate weight based on length and Count. Else give weight NA and drop it. Because these are not true empty, else there wouldn’t be species-information
# Estimate weight based on count. In the data, if count is >1, the weight is grouped. In some cases, all of Weight, Count and prey_length is NA, i.e. there are presences of prey but no information for calculating the total weight or there is a length but no count (0) which we will treat as one and retrieve a weight, I think this is less wrong.
# NA length unit matches the number of missing lengths. Where we don't have a unit is where length is lacking.
sum(is.na(new_db$UnitLngt))/length(new_db$UnitLngt)[1] 0.6835518
sum(is.na(new_db$prey_length))/length(new_db$prey_length)[1] 0.6878603
# we should end up with a bit less than 50 000 prey items:
new_db |> filter(is.na(prey_weight) & is.na(Count) & is.na(prey_length))filter: removed 52,606 rows (80%), 13,310 rows remaining
# A tibble: 13,310 × 58
tblUploadID tblHaulID tblPredatorInformati…¹ Ship Gear HaulNo StationNumber
<dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
1 8247 4705 55121 90MZ LBT 3 3
2 8247 4706 55123 90MZ LBT 4 3
3 8247 4707 55133 90MZ LBT 5 3
4 8247 4707 55141 90MZ LBT 5 3
5 8247 4707 55155 90MZ LBT 5 3
6 8247 4707 55156 90MZ LBT 5 3
7 8247 4691 55162 90MZ LBT 6 3
8 8247 4691 55171 90MZ LBT 6 3
9 8247 4691 55172 90MZ LBT 6 3
10 8247 4691 55174 90MZ LBT 6 3
# ℹ 13,300 more rows
# ℹ abbreviated name: ¹tblPredatorInformationID
# ℹ 51 more variables: Year <dbl>, Month <dbl>, Day <dbl>, Time <chr>,
# FishID <dbl>, AphiaIDPredator <dbl>, pred_weight <dbl>, Number <dbl>,
# MeasurementIncrement <dbl>, pred_length <dbl>, Code <chr>, Age <dbl>,
# Sex <chr>, MaturityScale <chr>, MaturityStage <dbl>,
# PreservationMethod <chr>, Regurgitated <dbl>, StomachFullness <lgl>, …
#prey |> filter(is.na(prey_weight) & is.na(Count) & is.na(prey_length))
new_db |> # we assume that NA in Count equals 1 based on this freq distribution of 1 and NA
summarise(n= n(), .by = Count) |>
arrange(-n)summarise: now 231 rows and 2 columns, ungrouped
# A tibble: 231 × 2
Count n
<dbl> <int>
1 NA 31103
2 1 25755
3 2 2692
4 3 1349
5 4 770
6 5 566
7 6 451
8 7 300
9 8 238
10 9 198
# ℹ 221 more rows
# compare weight of count = 1 and count = NA
new_db |> # The lower mean weight is probably becasue the prey was not intact.
filter(is.na(Count) | Count == 1) |>
mutate(Count = replace_na(Count, -1)) |>
filter(prey_weight > 0) |>
summarise(mean = mean(prey_weight), med = median(prey_weight),
.by = Count)filter: removed 9,058 rows (14%), 56,858 rows remaining
mutate: changed 31,103 values (55%) of 'Count' (31,103 fewer NAs)
filter: removed 13,441 rows (24%), 43,417 rows remaining
summarise: now 2 rows and 3 columns, ungrouped
# A tibble: 2 × 3
Count mean med
<dbl> <dbl> <dbl>
1 -1 1.60 0.224
2 1 4.94 1.02
# Species specific length-weights based on preyTaxamatch_LW
# Add a and bs for length weight relationships. Here we may have several as and bs at different taxonomic levels (abTaxlevel). We arrane those so that distinct prioritizes e.g. species over order.
Taxonomic_hierarchy <- c("species","family","class", "order", "phylum", NA)
new_db_preyTaxamatch_LW <- preyTaxamatch_LW |>
dplyr::select(valid_AphiaID,valid_name,kingdom,phylum,class,order,family,genus,db_AphiaID,db_ScientificName,db_CleanScientificName,abTaxlevel,a,b) |>
mutate(abTaxlevel = factor(abTaxlevel, Taxonomic_hierarchy)) |>
arrange(db_AphiaID, abTaxlevel) |>
distinct(db_AphiaID, .keep_all = TRUE)mutate: converted 'abTaxlevel' from character to factor (0 new NA)
distinct: removed 123 rows (20%), 481 rows remaining
# join in as and bs and taxonomy
new_db2 <- new_db |> # note necessary argument na_matches
left_join(new_db_preyTaxamatch_LW, join_by("AphiaIDPrey"=="db_AphiaID"), na_matches = "never")left_join: added 13 columns (valid_AphiaID, valid_name, kingdom, phylum, class, …)
> rows only in new_db 0
> rows only in new_db_preyTaxamatch_LW ( 398)
> matched rows 65,916
> ========
> rows total 65,916
# we lack a and b for few species when phylum as a taxonomic level is used but when a and b is missing (when abTaxlevel is NA), we use a = 0.01 and b = 3
new_db2 |>
summarise(n = n(), mean_a = mean(a, na.rm = TRUE), median_a = median(a, na.rm = TRUE), mean_b = mean(b, na.rm = TRUE), median_b = median(b, na.rm = TRUE), .by = abTaxlevel)summarise: now 6 rows and 6 columns, ungrouped
# A tibble: 6 × 6
abTaxlevel n mean_a median_a mean_b median_b
<fct> <int> <dbl> <dbl> <dbl> <dbl>
1 phylum 20489 0.0421 0.0422 2.75 2.75
2 order 13739 0.114 0.0704 2.35 2.49
3 <NA> 15820 0.00947 0.00947 3.03 3.03
4 species 13780 0.0156 0.00674 2.98 3.05
5 family 2042 0.00988 0.0107 3.12 3.13
6 class 46 0.0519 0.0529 2.63 2.62
# Prepare variables and calculate weight (total and individual) of prey given count and prey length
new_db3 <- new_db2 |>
mutate(prey_length = if_else(UnitLngt == "mm" & prey_length >= 0, prey_length/10, prey_length, missing = NA), # make cm
prey_weight = replace_na(prey_weight, -9),
Count = replace_na(Count, -9),
prey_length = replace_na(prey_length, -9),
Count = ifelse(Count <= 0 & (prey_length > 0 | prey_weight > 0), 1, Count), # When Count is Na (-9) or 0 and there is a weight or length, replace with 1.
prey_weight_source = if_else(prey_weight <= 0 & prey_length > 0 & Count > 0, "estimated", "observed", missing = NA), # Note that -9 length are gets "observed", i.e we will not estimate those weights.
prey_weight = if_else(prey_weight_source == "estimated",
if_else(is.na(a) & is.na(b), (0.01*(prey_length)^3)*Count, (a*prey_length^b)*Count),
prey_weight, missing = -9),
prey_ind_weight = if_else(prey_weight > 0 & Count > 0, prey_weight / Count, NA)) |>
dplyr::select(-UnitLngt) # Since this is no longer valid when we changed scale abovemutate: changed 31,129 values (47%) of 'Count' (31,103 fewer NAs)
changed 13,463 values (20%) of 'prey_weight' (13,463 fewer NAs)
changed 51,932 values (79%) of 'prey_length' (45,341 fewer NAs)
new variable 'prey_weight_source' (character) with 2 unique values and 0% NA
new variable 'prey_ind_weight' (double) with 5,981 unique values and 20% NA
new_db3 |> # some unreasonable weights but not too much
filter(prey_weight > 0) |>
ggplot(aes(prey_weight)) +
geom_histogram() +
facet_wrap(~abTaxlevel, scales = "free")filter: removed 13,369 rows (20%), 52,547 rows remaining
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.
# # Even if we estimate the prey weight based on the length of the prey and the number of the prey, we still have 151 rows that Weight == -9 or 0. We can give them the average weight but: While this may produce outliers when the predator is small compared to the avg prey size, these would be removed in a later cleaning stage. Lets gove them average weights.
new_db3 |> # Of course there are extreme weights that should not be included, these will be removed when we filter out unreasonable rpw:s (>1).
filter(prey_weight > 0 & !is.na(AphiaIDPrey)) |>
ggplot(aes(prey_ind_weight)) +
facet_wrap(~abTaxlevel, scales = "free") +
geom_histogram()filter: removed 14,558 rows (22%), 51,358 rows remaining
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.
new_db3 |> # 893 g sprat not possible
filter(AphiaIDPrey %in% c(293743, 1625944, 126425, 322683, 236448),
prey_ind_weight > 200) |> as.data.frame()filter: removed 65,914 rows (>99%), 2 rows remaining
tblUploadID tblHaulID tblPredatorInformationID Ship Gear HaulNo StationNumber
1 8319 6300 93265 67BC TVL 27 1
2 8323 6486 96765 LA99 TVS 12 1
Year Month Day Time FishID AphiaIDPredator pred_weight Number
1 2008 3 11 0625 35 126436 695.0 1
2 2004 3 14 0520 157 126436 188.5 1
MeasurementIncrement pred_length Code Age Sex MaturityScale MaturityStage
1 1 40 <NA> NA M M6 62
2 1 27 <NA> NA M M6 62
PreservationMethod Regurgitated StomachFullness FullStomWgt EmptyStomWgt
1 NBF 0 NA NA NA
2 NBF 0 NA NA NA
StomachEmpty GenSamp ShootLat ShootLong HaulLat HaulLong ICESrectangle Depth
1 0 N 56.2717 19.6533 NA NA 41G9 73
2 0 N 56.5833 20.5667 NA NA 42H0 63
Survey ICESDatabase Country Reporting_organisation CruiseID
1 <NA> Y LV 3140 LV314067BC2008
2 <NA> Y LV 3140 LV3140LA992004
Regurgitated_st tblPreyInformationID AphiaIDPrey IdentMet DigestionStage
1 0 143163 126425 VISO 1
2 0 147493 126425 VISO 1
GravMethod SubFactor PreySequence Count UnitWgt prey_weight prey_length
1 WETWT NA 2 1 g 893 10.5
2 WETWT NA 2 1 g 324 -9.0
OtherItems OtherCount OtherWgt AnalysingOrg prey_notes valid_AphiaID
1 <NA> NA NA 3140 <NA> 126425
2 <NA> NA NA 3140 <NA> 126425
valid_name kingdom phylum class order family genus
1 Sprattus sprattus Animalia Chordata Teleostei Clupeiformes Clupeidae Sprattus
2 Sprattus sprattus Animalia Chordata Teleostei Clupeiformes Clupeidae Sprattus
db_ScientificName db_CleanScientificName abTaxlevel a b
1 <NA> <NA> species 0.006736452 3.050674
2 <NA> <NA> species 0.006736452 3.050674
prey_weight_source prey_ind_weight
1 observed 893
2 observed 324
new_db3 |> # Saduria above 10 g is highly unlikely
filter(AphiaIDPrey %in% c(293511, 119034),
prey_ind_weight > 10) |> as.data.frame() |>
arrange(prey_ind_weight) |>
slice(1:20)filter: removed 65,300 rows (99%), 616 rows remaining
slice: removed 596 rows (97%), 20 rows remaining
tblUploadID tblHaulID tblPredatorInformationID Ship Gear HaulNo
1 8321 6380 94535 67BC TVL 1
2 8468 19421 119073 LABS LBT 3
3 8307 6097 88591 90MZ LBT 36
4 8307 6097 88591 90MZ LBT 36
5 8470 19473 121829 LA99 LBT 2
6 8470 19473 121829 LA99 LBT 2
7 8470 19473 121829 LA99 LBT 2
8 8321 6382 94630 67BC TVL 2
9 8468 19422 119166 LABS LBT 4
10 8468 19419 119007 LABS LBT 1
11 8468 19422 119123 LABS LBT 4
12 8305 5992 83228 90MZ LBT 21
13 8305 5992 83228 90MZ LBT 21
14 8305 5992 83228 90MZ LBT 21
15 8305 5992 83228 90MZ LBT 21
16 8305 5992 83228 90MZ LBT 21
17 8305 5992 83228 90MZ LBT 21
18 8305 5992 83228 90MZ LBT 21
19 8468 19423 119209 LABS LBT 5
20 8324 6514 97457 LA99 TVS 9
StationNumber Year Month Day Time FishID AphiaIDPredator pred_weight Number
1 1 2006 3 5 0955 28 126436 1325 1
2 2 1967 2 9 1435 7 126436 600 1
3 4 1965 7 22 0900 73 126436 1200 1
4 4 1965 7 22 0900 73 126436 1200 1
5 1 1968 1 13 0700 9 126436 1700 1
6 1 1968 1 13 0700 9 126436 1700 1
7 1 1968 1 13 0700 9 126436 1700 1
8 1 2006 3 5 1145 6 126436 830 1
9 2 1967 2 9 1330 47 126436 1013 1
10 2 1967 2 9 0955 54 126436 670 1
11 2 1967 2 9 1330 4 126436 790 1
12 2 1964 4 27 0700 3 126436 800 1
13 2 1964 4 27 0700 3 126436 800 1
14 2 1964 4 27 0700 3 126436 800 1
15 2 1964 4 27 0700 3 126436 800 1
16 2 1964 4 27 0700 3 126436 800 1
17 2 1964 4 27 0700 3 126436 800 1
18 2 1964 4 27 0700 3 126436 800 1
19 2 1967 2 10 0645 20 126436 640 1
20 2 2003 11 22 1050 10 126436 1630 1
MeasurementIncrement pred_length Code Age Sex MaturityScale MaturityStage
1 1 50 <NA> NA F M6 62
2 1 41 <NA> NA M M6 62
3 1 49 <NA> NA F M6 65
4 1 49 <NA> NA F M6 65
5 1 54 <NA> NA F M6 62
6 1 54 <NA> NA F M6 62
7 1 54 <NA> NA F M6 62
8 1 43 <NA> NA M M6 62
9 1 48 <NA> NA F M6 65
10 1 44 <NA> NA F M6 62
11 1 44 <NA> NA F M6 62
12 1 42 <NA> NA F M6 62
13 1 42 <NA> NA F M6 62
14 1 42 <NA> NA F M6 62
15 1 42 <NA> NA F M6 62
16 1 42 <NA> NA F M6 62
17 1 42 <NA> NA F M6 62
18 1 42 <NA> NA F M6 62
19 1 40 <NA> NA F M6 62
20 1 58 <NA> NA F M6 64
PreservationMethod Regurgitated StomachFullness FullStomWgt EmptyStomWgt
1 NBF 0 NA NA NA
2 NBF 0 NA NA NA
3 NBF 0 NA NA NA
4 NBF 0 NA NA NA
5 NBF 0 NA NA NA
6 NBF 0 NA NA NA
7 NBF 0 NA NA NA
8 NBF 0 NA NA NA
9 NBF 0 NA NA NA
10 NBF 0 NA NA NA
11 NBF 0 NA NA NA
12 NBF 0 NA NA NA
13 NBF 0 NA NA NA
14 NBF 0 NA NA NA
15 NBF 0 NA NA NA
16 NBF 0 NA NA NA
17 NBF 0 NA NA NA
18 NBF 0 NA NA NA
19 NBF 0 NA NA NA
20 NBF 0 NA NA NA
StomachEmpty GenSamp ShootLat ShootLong HaulLat HaulLong ICESrectangle Depth
1 0 N 55.7117 19.9983 NA NA 40G9 74
2 0 N 55.9833 18.5333 NA NA 40G8 110
3 0 N 56.4166 20.0833 NA NA 41H0 80
4 0 N 56.4166 20.0833 NA NA 41H0 80
5 0 N 56.5833 20.7500 NA NA 42H0 40
6 0 N 56.5833 20.7500 NA NA 42H0 40
7 0 N 56.5833 20.7500 NA NA 42H0 40
8 0 N 55.6766 20.0883 NA NA 40H0 76
9 0 N 55.7500 18.5833 NA NA 40G8 110
10 0 N 55.8666 18.9499 NA NA 40G8 97
11 0 N 55.7500 18.5833 NA NA 40G8 110
12 0 N 55.5833 20.2500 NA NA 40H0 74
13 0 N 55.5833 20.2500 NA NA 40H0 74
14 0 N 55.5833 20.2500 NA NA 40H0 74
15 0 N 55.5833 20.2500 NA NA 40H0 74
16 0 N 55.5833 20.2500 NA NA 40H0 74
17 0 N 55.5833 20.2500 NA NA 40H0 74
18 0 N 55.5833 20.2500 NA NA 40H0 74
19 0 N 55.7000 20.3000 NA NA 40H0 75
20 0 N 55.8667 20.0833 NA NA 40H0 60
Survey ICESDatabase Country Reporting_organisation CruiseID
1 <NA> Y LV 3140 LV314067BC2006
2 <NA> N LV 3140 LV3140LABS1967
3 <NA> N LV 3140 LV314090MZ1965
4 <NA> N LV 3140 LV314090MZ1965
5 <NA> N LV 3140 LV3140LA991968
6 <NA> N LV 3140 LV3140LA991968
7 <NA> N LV 3140 LV3140LA991968
8 <NA> Y LV 3140 LV314067BC2006
9 <NA> N LV 3140 LV3140LABS1967
10 <NA> N LV 3140 LV3140LABS1967
11 <NA> N LV 3140 LV3140LABS1967
12 <NA> N LV 3140 LV314090MZ1964
13 <NA> N LV 3140 LV314090MZ1964
14 <NA> N LV 3140 LV314090MZ1964
15 <NA> N LV 3140 LV314090MZ1964
16 <NA> N LV 3140 LV314090MZ1964
17 <NA> N LV 3140 LV314090MZ1964
18 <NA> N LV 3140 LV314090MZ1964
19 <NA> N LV 3140 LV3140LABS1967
20 <NA> Y LV 3140 LV3140LA992003
Regurgitated_st tblPreyInformationID AphiaIDPrey IdentMet DigestionStage
1 0 144545 119034 VISO 1
2 0 195122 119034 VISO 1
3 0 136833 119034 VISO 1
4 0 136834 119034 VISO 1
5 0 198804 119034 VISO 1
6 0 198805 119034 VISO 1
7 0 198806 119034 VISO 1
8 0 144686 119034 VISO 1
9 0 195014 119034 VISO 1
10 0 194924 119034 VISO 1
11 0 194977 119034 VISO 1
12 0 123124 119034 VISO 1
13 0 123125 119034 VISO 1
14 0 123126 119034 VISO 1
15 0 123127 119034 VISO 1
16 0 123128 119034 VISO 1
17 0 123129 119034 VISO 1
18 0 123130 119034 VISO 1
19 0 195175 119034 VISO 1
20 0 148288 119034 VISO 1
GravMethod SubFactor PreySequence Count UnitWgt prey_weight prey_length
1 WETWT NA 1 1 g 10.01 -9.0
2 WETWT NA 1 1 g 10.04 -9.0
3 WETWT NA 1 1 g 10.10 6.0
4 WETWT NA 2 1 g 10.10 5.9
5 WETWT NA 2 1 g 10.10 5.6
6 WETWT NA 3 1 g 10.10 3.2
7 WETWT NA 4 1 g 10.10 2.9
8 WETWT NA 2 1 g 10.15 -9.0
9 WETWT NA 1 1 g 10.19 -9.0
10 WETWT NA 1 1 g 10.20 -9.0
11 WETWT NA 1 1 g 10.20 -9.0
12 WETWT NA 3 1 g 10.34 5.6
13 WETWT NA 4 1 g 10.34 3.9
14 WETWT NA 5 1 g 10.34 3.8
15 WETWT NA 6 1 g 10.34 4.0
16 WETWT NA 7 1 g 10.34 4.0
17 WETWT NA 8 1 g 10.34 4.2
18 WETWT NA 9 1 g 10.34 4.6
19 WETWT NA 1 1 g 10.40 -9.0
20 WETWT NA 3 1 g 10.44 6.3
OtherItems OtherCount OtherWgt AnalysingOrg prey_notes valid_AphiaID
1 <NA> NA NA 3140 <NA> 119034
2 <NA> NA NA 3140 <NA> 119034
3 <NA> NA NA 3140 <NA> 119034
4 <NA> NA NA 3140 <NA> 119034
5 <NA> NA NA 3140 <NA> 119034
6 <NA> NA NA 3140 <NA> 119034
7 <NA> NA NA 3140 <NA> 119034
8 <NA> NA NA 3140 <NA> 119034
9 <NA> NA NA 3140 <NA> 119034
10 <NA> NA NA 3140 <NA> 119034
11 <NA> NA NA 3140 <NA> 119034
12 <NA> NA NA 3140 <NA> 119034
13 <NA> NA NA 3140 <NA> 119034
14 <NA> NA NA 3140 <NA> 119034
15 <NA> NA NA 3140 <NA> 119034
16 <NA> NA NA 3140 <NA> 119034
17 <NA> NA NA 3140 <NA> 119034
18 <NA> NA NA 3140 <NA> 119034
19 <NA> NA NA 3140 <NA> 119034
20 <NA> NA NA 3140 <NA> 119034
valid_name kingdom phylum class order family
1 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
2 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
3 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
4 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
5 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
6 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
7 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
8 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
9 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
10 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
11 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
12 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
13 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
14 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
15 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
16 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
17 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
18 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
19 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
20 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
genus db_ScientificName db_CleanScientificName abTaxlevel a
1 Saduria <NA> <NA> order 0.1693124
2 Saduria <NA> <NA> order 0.1693124
3 Saduria <NA> <NA> order 0.1693124
4 Saduria <NA> <NA> order 0.1693124
5 Saduria <NA> <NA> order 0.1693124
6 Saduria <NA> <NA> order 0.1693124
7 Saduria <NA> <NA> order 0.1693124
8 Saduria <NA> <NA> order 0.1693124
9 Saduria <NA> <NA> order 0.1693124
10 Saduria <NA> <NA> order 0.1693124
11 Saduria <NA> <NA> order 0.1693124
12 Saduria <NA> <NA> order 0.1693124
13 Saduria <NA> <NA> order 0.1693124
14 Saduria <NA> <NA> order 0.1693124
15 Saduria <NA> <NA> order 0.1693124
16 Saduria <NA> <NA> order 0.1693124
17 Saduria <NA> <NA> order 0.1693124
18 Saduria <NA> <NA> order 0.1693124
19 Saduria <NA> <NA> order 0.1693124
20 Saduria <NA> <NA> order 0.1693124
b prey_weight_source prey_ind_weight
1 2.149667 observed 10.01
2 2.149667 observed 10.04
3 2.149667 observed 10.10
4 2.149667 observed 10.10
5 2.149667 observed 10.10
6 2.149667 observed 10.10
7 2.149667 observed 10.10
8 2.149667 observed 10.15
9 2.149667 observed 10.19
10 2.149667 observed 10.20
11 2.149667 observed 10.20
12 2.149667 observed 10.34
13 2.149667 observed 10.34
14 2.149667 observed 10.34
15 2.149667 observed 10.34
16 2.149667 observed 10.34
17 2.149667 observed 10.34
18 2.149667 observed 10.34
19 2.149667 observed 10.40
20 2.149667 observed 10.44
# Saduria has a mean weight of 1 g. max weight of 7,8 or 9. The count may be off etc. If analysing saduria individuals level, we need to revisit these values (https://doi.org/10.1111/j.0021-8790.2004.00800.x and https://www.jstor.org/stable/24831823 ). Not that outliers likely disappear when I filter on relative prey weight (<1).
# To control for outliers affecting average of individual weight, remove values outside quartiles
prey_avg_ind_weight <- new_db3 |>
filter(!is.na(AphiaIDPrey)) |>
group_by(AphiaIDPrey) |>
filter(!prey_ind_weight %in% boxplot.stats(prey_ind_weight)$out) |>
reframe(avg_weight = mean(prey_ind_weight, na.rm = TRUE)) |>
ungroup()filter: removed 13,929 rows (21%), 51,987 rows remaining
group_by: one grouping variable (AphiaIDPrey)
filter (grouped): removed 4,910 rows (9%), 47,077 rows remaining (removed 0 groups, 82 groups remaining)
ungroup: no grouping variables remain
prey_avg_ind_weight |> filter(is.na(avg_weight))filter: removed 79 rows (96%), 3 rows remaining
# A tibble: 3 × 2
AphiaIDPrey avg_weight
<dbl> <dbl>
1 104152 NaN
2 104241 NaN
3 234024 NaN
# This should be the average prey weight which we can use to calculate the weight of these prey if we have the counts. Left join that summarized data and do the estimate of weight based on length. But first figure out which unit prey size is
# Join average weight and estimate weight
new_db4 <- new_db3 |>
left_join(prey_avg_ind_weight) |>
mutate(prey_weight = if_else(prey_weight <= 0 & Count > 0 & !is.na(AphiaIDPrey), Count * avg_weight, prey_weight, missing = -9),
prey_weight = replace_na(prey_weight, 0)) # New name and remove NaNs created when estimating avg weight excluding outliers (mean() creates NaNs).Joining with `by = join_by(AphiaIDPrey)`
left_join: added one column (avg_weight)
> rows only in new_db3 13,929
> rows only in prey_avg_ind_weight ( 0)
> matched rows 51,987
> ========
> rows total 65,916
mutate: changed 56 values (<1%) of 'prey_weight' (0 new NAs)
# Those where prey weight is missing is duie to missing lengths or counts.
new_db4 |> filter(prey_weight <= 0) |> as.data.frame() |> distinct(prey_weight, prey_length, Count)filter: removed 52,600 rows (80%), 13,316 rows remaining
distinct: removed 13,310 rows (>99%), 6 rows remaining
prey_weight prey_length Count
1 -9 -9 -9
2 0 -9 1
3 0 -9 3
4 0 -9 2
5 -9 -9 1
6 -9 -9 3
# Remove these (nearly 12000 rows, which are mainly the empties but also 270 na count/weight/length in prey.
new_db4 <- new_db4 |>
filter(!(prey_weight <= 0 & Count <= 0 & prey_length <= 0),
!is.na(prey_weight)) # three rowsfilter: removed 13,310 rows (20%), 52,606 rows remaining
Summarise prey weights by predator and prey group
# Make prey groups based on taxonomy. Groups: herring, sprat, saduria, chordata, invertebrates, other
# clupeids are key prey and should be either herring or sprat
new_db4 |> # We split the non idd clupeids into sprat and herring in the next chunk.
filter(family == "Clupeidae") |> #%in% c("Clupea harengus", "Sprattus sprattus")) |>
summarise(n = n(), .by= c(valid_name))filter: removed 42,925 rows (82%), 9,681 rows remaining
summarise: now 4 rows and 2 columns, ungrouped
# A tibble: 4 × 2
valid_name n
<chr> <int>
1 Clupeidae 332
2 Sprattus sprattus 7000
3 Clupea harengus 2284
4 Clupea 65
# Saduria are also key prey, but...
new_db4 |> # No non idd Isopods
filter(order == "Isopoda") |> #%in% c("Clupea harengus", "Sprattus sprattus")) |>
summarise(n = n(), .by= c(valid_name))filter: removed 46,213 rows (88%), 6,393 rows remaining
summarise: now 3 rows and 2 columns, ungrouped
# A tibble: 3 × 2
valid_name n
<chr> <int>
1 Saduria entomon 6386
2 Idotea balthica 6
3 Isopoda 1
# make the groups
new_db5 <- new_db4 |> # arguments are evaluated in order so proceed from specific to general
mutate(prey_group = case_when(valid_name == "Clupea harengus"~ "herring",
valid_name == "Sprattus sprattus"~ "sprat",
valid_name == "Saduria entomon" ~ "saduria",
family == "Clupeidae" ~ "unidd_clupeids", # temporary, see below
#is.na(valid_name) ~ "other",
phylum == "Chordata"~ "other_chords",
kingdom == "Animalia" & !phylum == "Chordata" ~ "other_inverts",
.default = "other"))mutate: new variable 'prey_group' (character) with 7 unique values and 0% NA
# seems to work
new_db5 |>
summarise(n = n(), .by= prey_group)summarise: now 7 rows and 2 columns, ungrouped
# A tibble: 7 × 2
prey_group n
<chr> <int>
1 other_inverts 30273
2 saduria 6386
3 other_chords 5073
4 unidd_clupeids 397
5 sprat 7000
6 herring 2284
7 other 1193
new_db5 |> # only NA prey ids and Fucus in the other group
filter(prey_group == "other") |>
summarise(n = n(), .by = AphiaIDPrey)filter: removed 51,413 rows (98%), 1,193 rows remaining
summarise: now 2 rows and 2 columns, ungrouped
# A tibble: 2 × 2
AphiaIDPrey n
<dbl> <int>
1 144129 1
2 NA 1192
# The NAs
new_db5 |>
filter(is.na(prey_group))filter: removed all rows (100%)
# A tibble: 0 × 74
# ℹ 74 variables: tblUploadID <dbl>, tblHaulID <dbl>,
# tblPredatorInformationID <dbl>, Ship <chr>, Gear <chr>, HaulNo <dbl>,
# StationNumber <dbl>, Year <dbl>, Month <dbl>, Day <dbl>, Time <chr>,
# FishID <dbl>, AphiaIDPredator <dbl>, pred_weight <dbl>, Number <dbl>,
# MeasurementIncrement <dbl>, pred_length <dbl>, Code <chr>, Age <dbl>,
# Sex <chr>, MaturityScale <chr>, MaturityStage <dbl>,
# PreservationMethod <chr>, Regurgitated <dbl>, StomachFullness <lgl>, …
# only fish in other Chordates
new_db5 |>
filter(prey_group %in% c("other_chords")) |>
summarise(n= n(), .by = valid_name)filter: removed 47,533 rows (90%), 5,073 rows remaining
summarise: now 21 rows and 2 columns, ungrouped
# A tibble: 21 × 2
valid_name n
<chr> <int>
1 Pomatoschistus minutus 885
2 Cottus gobio 6
3 Gadus morhua 126
4 Ammodytes tobianus 18
5 Enchelyopus cimbrius 121
6 Pomatoschistus 66
7 Gasterosteus aculeatus 492
8 Osmerus eperlanus 52
9 Myoxocephalus quadricornis 1
10 Platichthys flesus 16
# ℹ 11 more rows
# only invert phyla
new_db5 |>
filter(prey_group %in% c("other_inverts")) |>
summarise(n= n(), .by = phylum)filter: removed 22,333 rows (42%), 30,273 rows remaining
summarise: now 6 rows and 2 columns, ungrouped
# A tibble: 6 × 2
phylum n
<chr> <int>
1 Arthropoda 24559
2 Annelida 5424
3 Priapulida 185
4 Mollusca 92
5 Nematoda 12
6 Ctenophora 1
With these estimates of weight based on either length and in a very few cases average weight of that prey, we calculate the total weight of these prey per individual predator stomach, and then pivot wider.
# make predator individual summary of weight
new_pg <- new_db5 |>
summarise(prey_tot_weight = sum(prey_weight), .by = c(tblPredatorInformationID, prey_group)) |>
filter(prey_tot_weight > 0)summarise: now 32,441 rows and 3 columns, ungrouped
filter: removed 4 rows (<1%), 32,437 rows remaining
# No NAs which seems good
sum(is.na(new_pg$prey_group))[1] 0
sum(is.na(new_pg$prey_tot_weight))[1] 0
# change to wide format to get predatorID unique rows.
new_pg <- new_pg |>
pivot_wider(names_from = "prey_group", values_from = "prey_tot_weight", values_fill = 0)pivot_wider: reorganized (prey_group, prey_tot_weight) into (other_inverts,
saduria, other_chords, unidd_clupeids, sprat, …) [was 32437x3, now 25740x8]
# no duplicates of predator ID.
new_pg |> group_by(tblPredatorInformationID) |> filter(n()>1)group_by: one grouping variable (tblPredatorInformationID)
filter (grouped): removed all rows (100%)
# A tibble: 0 × 8
# Groups: tblPredatorInformationID [0]
# ℹ 8 variables: tblPredatorInformationID <dbl>, other_inverts <dbl>,
# saduria <dbl>, other_chords <dbl>, unidd_clupeids <dbl>, sprat <dbl>,
# herring <dbl>, other <dbl>
# Split unidentified clupeids on sprat and herring
new_pg <- new_pg |>
mutate(sprat = sprat + unidd_clupeids*0.5,
herring = herring + unidd_clupeids*0.5) |>
dplyr::select(!unidd_clupeids)mutate: changed 332 values (1%) of 'sprat' (0 new NAs)
changed 332 values (1%) of 'herring' (0 new NAs)
Next I will left_join in the remaining predator information, and after that bind_rows “empty stomachs” (with respect to these 3 prey species). Since the IDs are not overlapping, it doesn’t matter that I already have some 0’s here for some species
new_pg_pred <- new_pg |>
left_join(pred |> rename(pred_length = Length,
pred_weight = IndWgt), by = "tblPredatorInformationID")rename: renamed 2 variables (pred_weight, pred_length)
left_join: added 40 columns (tblUploadID, tblHaulID, Ship, Gear, HaulNo, …)
> rows only in new_pg 0
> rows only in rename(pred, pred_lengt.. (13,537)
> matched rows 25,740
> ========
> rows total 25,740
# two empty stomachs
new_pg_pred |>
rowwise() |>
filter(sum(saduria, other_inverts, other_chords, sprat, herring, other) == 0) |>
ungroup()filter: removed all rows (100%)
ungroup: no grouping variables remain
# A tibble: 0 × 47
# ℹ 47 variables: tblPredatorInformationID <dbl>, other_inverts <dbl>,
# saduria <dbl>, other_chords <dbl>, sprat <dbl>, herring <dbl>, other <dbl>,
# tblUploadID <dbl>, tblHaulID <dbl>, Ship <chr>, Gear <chr>, HaulNo <dbl>,
# StationNumber <dbl>, Year <dbl>, Month <dbl>, Day <dbl>, Time <chr>,
# FishID <dbl>, AphiaIDPredator <dbl>, pred_weight <dbl>, Number <dbl>,
# MeasurementIncrement <dbl>, pred_length <dbl>, Code <chr>, Age <dbl>,
# Sex <chr>, MaturityScale <chr>, MaturityStage <dbl>, …
Now add in the “empty stomachs” from pred3 using bind_rows. When I bind_rows, the columns that are not matching get NA. The only column not matching should be the average weight columns. They will get NA, and I’ll change it to 0.
Bind in empty stomachs
# The empty stomachs to be added are those that are empty in predator data. For data before 2005 we don't know whether they are regurgitated or empty because of not eating. For an analysis using all data (1963-) we need to keep that in mind.
# Because of the issue of not getting prey weight even though they are present above, we need to make sure to drop these stomachs in the full data set also before joining, so that we don't treat them as empty!
# StomachEmpty = 1 is an empty stomach even though this is not an mandatory parameter (?). Since regurgitated==1 and NA have been removed, all empty stomachs seem to be truly empty, i.e. there are no tblpredatorinformationIDs with stomach empty that are in the prey data (No overlap in stomachs between the predator and prey data sets).
empty_stom <- pred3 |>
filter(StomachEmpty == 1) |>
filter(!tblPredatorInformationID %in% unique(new_pg$tblPredatorInformationID))filter: removed 28,414 rows (74%), 9,960 rows remaining
filter: no rows removed
# Bind rows!
new_dat <- new_pg_pred |>
bind_rows(empty_stom)
# Added NAs with empty stomachs, replace with 0
new_dat2 <- new_dat |>
mutate(other = replace_na(other, 0),
sprat = replace_na(sprat, 0),
herring = replace_na(herring, 0),
saduria = replace_na(saduria, 0),
other_inverts = replace_na(other_inverts, 0),
other_chords = replace_na(other_chords, 0))mutate: changed 9,960 values (28%) of 'other_inverts' (9,960 fewer NAs)
changed 9,960 values (28%) of 'saduria' (9,960 fewer NAs)
changed 9,960 values (28%) of 'other_chords' (9,960 fewer NAs)
changed 9,960 values (28%) of 'sprat' (9,960 fewer NAs)
changed 9,960 values (28%) of 'herring' (9,960 fewer NAs)
changed 9,960 values (28%) of 'other' (9,960 fewer NAs)
# Fix that Denmark report in kg
new_dat2 |>
ggplot(aes(pred_weight, fill = Country)) +
facet_wrap(~Country, scales = "free") +
geom_histogram() +
guides( fill = "none")`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 687 rows containing non-finite outside the scale range
(`stat_bin()`).
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.
new_dat2 <- new_dat2 |>
mutate(pred_weight = ifelse(Country %in% c("DK"), pred_weight*1000, pred_weight))mutate: changed 2,286 values (6%) of 'pred_weight' (0 new NAs)
# there are negative pred weights from LV in the 1980s. We deal with them when we estimate weights (part 7.). IF we remove them, we see that cods in the data was small in the 1980.
new_dat2 |>
filter(pred_weight > 0) |>
mutate(decade = round(Year/10) * 10) |>
summarise(mean_predw = mean(pred_weight, na.rm = TRUE), n = n(), .by = c(decade, Country))filter: removed 879 rows (2%), 34,821 rows remaining
mutate: new variable 'decade' (double) with 7 unique values and 0% NA
summarise: now 11 rows and 4 columns, ungrouped
# A tibble: 11 × 4
decade Country mean_predw n
<dbl> <chr> <dbl> <int>
1 1970 LV 281. 9112
2 1980 LV 12.4 4183
3 1960 LV 470. 7042
4 1990 LV 1076. 537
5 2000 LV 501. 5652
6 2010 LV 404. 4653
7 2020 SE 298. 478
8 2020 PL 408. 654
9 2010 DK 635. 494
10 2020 DK 486. 1792
11 2020 DE 162. 224
# but their distribution seems good.
new_dat2 |>
filter(Year > 1979 & Year < 1990 & Country == "LV",
pred_weight > 0) |>
ggplot(aes(pred_weight)) +
geom_histogram() +
guides( fill = "none") +
labs(title = "LV 1980s")filter: removed 33,930 rows (95%), 1,770 rows remaining
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.
# Also, note that most historical data in the NDB are from LV
new_dat2 |>
mutate(decade = round(Year/10) * 10) |>
ggplot(aes(pred_weight, color = factor(decade))) +
facet_wrap(~Country, scales = "free") +
geom_freqpoly(binwidth=100) +
guides( fill = "none")mutate: new variable 'decade' (double) with 7 unique values and 0% NA
Warning: Removed 687 rows containing non-finite outside the scale range (`stat_bin()`).
The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.
# Sweden report in mm
new_dat2 |>
ggplot(aes(pred_length, fill = Country)) +
facet_wrap(~Country, scales = "free") +
geom_histogram() +
guides( fill = "none")`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.
new_dat2 <- new_dat2 |>
mutate(pred_length = ifelse(Country == "SE", pred_length/10, pred_length))mutate: changed 478 values (1%) of 'pred_length' (0 new NAs)
# Remove non cod predators (e.g. Whiting 126438)
new_data <- new_dat2 |> filter(AphiaIDPredator %in% c(126436))filter: removed 224 rows (1%), 35,476 rows remaining
new_data |>
summarise(count = n(), .by = c(Year, Country)) |>
ggplot() +
geom_bar(aes(Year, count, fill = Country), alpha = 1, stat="identity", position = "stack") +
scale_x_continuous(n.breaks = 10) + ylab("# stomachs") +
labs(title="New data base")summarise: now 50 rows and 3 columns, ungrouped
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.
4. Old database data
The data in the old ICES database contains much more data than what is found in the new version of the database but is in poor shape containing many errors and unique identiifers are missing. Below, identifiers for hauls and predators are made and the old db is cleaned.
Make unique identifier
glimpse(ODB)Rows: 574,684
Columns: 52
$ Dataset <chr> "Stomach data: Latvian historic", "Stomach…
$ RecordType <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Country <chr> "Latvia", "Latvia", "Latvia", "Latvia", "L…
$ Ship <chr> "MAZ", "UNKN", "UNKN", "UNKN", "UNKN", "ZB…
$ Latitude <dbl> 54.25, 54.25, 54.25, 54.25, 54.25, 54.25, …
$ Longitude <dbl> 14.5, 15.5, 15.5, 15.5, 15.5, 15.5, 15.5, …
$ Estimated_Lat_Lon <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ ICES_StatRec <chr> "37G4", "37G5", "37G5", "37G5", "37G5", "3…
$ ICES_AreaCode <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Year <dbl> 1968, 1973, 1973, 1973, 1973, 1983, 1983, …
$ Month <chr> "9", "5", "5", "5", "5", "12", "12", "1", …
$ Day <chr> "19", "11", "11", "11", "11", "16", "16", …
$ Time <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Station <chr> "1", "15", "15", "15", "15", "7", "7", "30…
$ Haul <chr> "1", "15", "15", "15", "15", "7", "7", "30…
$ Sampling_Method <chr> "Demersal sampling", "Demersal sampling", …
$ Depth <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Temperature <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ `SampleNo(FishID)` <dbl> 52, 100, 69, 74, 97, 4, 61, 11, 18, 26, 30…
$ ICES_SampleID <dbl> 2628171, 2528147, 2528139, 2528140, 252814…
$ Predator_AphiaID <dbl> 126436, 126436, 126436, 126436, 126436, 12…
$ Predator_LatinName <chr> "Gadus morhua", "Gadus morhua", "Gadus mor…
$ `Predator_Weight(mean)` <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ `Predator_Age(mean)` <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ `Predator_Lengh(mean)` <dbl> 25, 17, 17, 18, 16, 60, 77, 18, 25, 19, 19…
$ Predator_LowerLengthBound <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Predator_UpperLengthBound <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Predator_CPUE <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ `GallBladder_stage(class)` <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Stomach_METFP <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Stomach_TotalNo <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Stomach_WithFood <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ Stomach_Regurgitated <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ Stomach_WithSkeletalRemains <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ Stomach_Empty <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ Stomach_ContentWgt <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Stomach_EmptyWgt <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Stomach_fullness <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Stomach_Item <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ ICES_ItemID <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Prey_AphiaID <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Prey_LatinName <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Prey_IdentMet <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Prey_DigestionStage <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Prey_TotalNo <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Prey_Weight <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Prey_LengthIdentifier <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Prey_Length <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Prey_LowerLengthBound <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Prey_UpperLengthBound <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Prey_MinNo <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Remarks <chr> "Reported Stomach_Item: ; Repor", "Reporte…
# Each row in old data contains a prey item, i.e. we have many rows for each predator and there is no unique predator ID. We need to generate an identifier so that we can compare observations the old and new data and remove overlapping data. The ICESsampleID is for the predator I assume, ICESitemID are prey items but there are duplicates, possibly mainly due to the two data sets (Year of stomach and Stomach tender using the same IDs which become duplicates?).
# By creating an ID with Date (Year, Month and Day), Country and Haul (HaulNo (new data) and Haul (old data)) we can identify the data missing from the new database. Ship info in the old data is unclean, by instead assuming that Country should be unique to date and haul (sort of Country as a proxy for Ship). However, there are 41506 rows where country and ship is missing. VT removes these.
#length(is.na(old_db[which(is.na(old_db$Country)),]$Ship))
old_db <- janitor::remove_empty(ODB, which = "cols") # remove cols with only Nas to increase readability
# fix country names so that they equal the other datasets
unique(old_db$Country) [1] "Latvia" "Denmark" "Poland" "Sweden"
[5] NA "France" "Germany" "Norway"
[9] "The Netherlands" "United Kingdom" "Belgium" "Ireland"
old_db <- old_db |>
mutate(Country = case_when(Country == "Latvia" ~ "LV",
Country == "Denmark" ~ "DK",
Country == "Poland" ~ "PL",
Country == "Sweden" ~ "SE",
Country == "Germany" ~ "DE",
is.na(Country) ~ NA,
.default = "NotBaltic"))mutate: changed 533,178 values (93%) of 'Country' (0 new NAs)
old_db %>%
filter(Country %in% c("NotBaltic",NA)) %>%
count(Country)filter: removed 392,826 rows (68%), 181,858 rows remaining
count: now 2 rows and 2 columns, ungrouped
# A tibble: 2 × 2
Country n
<chr> <int>
1 NotBaltic 140352
2 <NA> 41506
sum(is.na(old_db$Haul))[1] 50317
sum(is.na(old_db$ICES_StatRec))[1] 0
# we get rid of almost all NAs (198 are left) in Haul by excluding based on lat lon (i.e. outside the Baltic making Country not necessary)
# filter and create haul identifier
old_db <- old_db |>
rename(HaulNo = Haul,
ICESrectangle = ICES_StatRec) |>
filter(# remove non cods
Predator_AphiaID %in% c(126436),
# remove data outside the Baltic (this removed all Country == NA)
between(Latitude, 53, 60) & between(Longitude, 12.5, 24)) |>
mutate(Haul_ID = paste(Country, Year, Month, Day, HaulNo, ICESrectangle, sep = "_"))rename: renamed 2 variables (ICESrectangle, HaulNo)
filter: removed 207,088 rows (36%), 367,596 rows remaining
mutate: new variable 'Haul_ID' (character) with 2,649 unique values and 0% NA
old_db %>%
filter(Country %in% c("NotBaltic",NA)) %>%
count(Country)filter: removed all rows (100%)
count: now 0 rows and 2 columns, ungrouped
# A tibble: 0 × 2
# ℹ 2 variables: Country <chr>, n <int>
# join hi and fi from new db to get columns for identifier
# add Country and ICESrectangle to new db prey data from haul data and create identifier
prey_temp <- prey |>
left_join(dplyr::select(hifi, tblUploadID, Country, ICESrectangle), by = "tblUploadID", multiple = "any") |>
mutate(Haul_ID = paste(Country, Year, Month, Day, HaulNo, ICESrectangle, sep = "_"))left_join: added 2 columns (Country, ICESrectangle)
> rows only in prey 0
> rows only in dplyr::select(hifi, tbl.. ( 59)
> matched rows 54,235
> ========
> rows total 54,235
mutate: new variable 'Haul_ID' (character) with 1,589 unique values and 0% NA
# identify cod in hauls in the Baltic not present in the new data. I.e. what the old db is adding to the new db. For the January 2024 ICES db download, this removes 23 487 rows.
missing_db <- old_db |>
anti_join(prey_temp, by = "Haul_ID") # return all rows from x (old_d) without a match in y (prey)anti_join: added no columns
> rows only in old_db 344,109
> rows only in prey_temp ( 48,707)
> matched rows ( 23,487)
> =========
> rows total 344,109
Correct coordinates where possible
Below we update ICES rectangle center based coordinates in the ODB with coordinates from matching BITS surveys.
# # few data points have estimated the coordinates according to Estimated_Lat_Lon
sum(old_db$Estimated_Lat_Lon == "Yes", na.rm = TRUE)/nrow(old_db) [1] 0.0001904264
missing_db |> # But its likely all ODB data as the lat lon of the rectangle midpoint equals the lat lon columns
filter(ices.rect(ICESrectangle)$lon == Longitude & ices.rect(ICESrectangle)$lat == Latitude) |>
count()filter: no rows removed
count: now one row and one column, ungrouped
# A tibble: 1 × 1
n
<int>
1 344109
length(unique(missing_db$Longitude)) # only 11 unique lats [1] 11
missing_db |> # and 60 unique coords in the old db
dplyr::select(Latitude, Longitude) |>
distinct()distinct: removed 344,049 rows (>99%), 60 rows remaining
# A tibble: 60 × 2
Latitude Longitude
<dbl> <dbl>
1 54.2 14.5
2 54.2 15.5
3 54.2 19.5
4 54.8 14.5
5 54.8 15.5
6 54.8 19.5
7 55.2 15.5
8 55.2 16.5
9 55.2 17.5
10 55.2 18.5
# ℹ 50 more rows
# Make an identifier
intersect(tolower(colnames(missing_db)), tolower(colnames(bits_hh)))[1] "country" "ship" "year" "month" "day" "haulno" "depth"
sum(is.na(missing_db$HaulNo)) # a few rows lacks haul number.[1] 198
missing_db2 <- missing_db |>
mutate(Hid = paste(Country, Year, Month, Day, HaulNo, ICESrectangle, sep = "_")) #mutate: new variable 'Hid' (character) with 2,441 unique values and 0% NA
bits_hh <- bits_hh |>
mutate(Hid = paste(Country, Year, Month, Day, HaulNo, StatRec, sep = "_"))mutate: new variable 'Hid' (character) with 18,105 unique values and 0% NA
length(intersect(missing_db2$Hid, bits_hh$Hid) ) # 553 unique hauls that are matching based on Country, Year, Month, Day, ices_rect[1] 553
missing_db2 |> # We should have done better in the current millenium...
filter(!Hid %in% unique(bits_hh$Hid)) |>
mutate(decade = round(Year/10) * 10) |>
group_by(decade, Country) |>
count() |>
ungroup()filter: removed 29,670 rows (9%), 314,439 rows remaining
mutate: new variable 'decade' (double) with 6 unique values and 0% NA
group_by: 2 grouping variables (decade, Country)
count: now 10 rows and 3 columns, 2 group variables remaining (decade, Country)
ungroup: no grouping variables remain
# A tibble: 10 × 3
decade Country n
<dbl> <chr> <int>
1 1960 LV 28780
2 1970 LV 102072
3 1980 LV 120830
4 1990 DK 70
5 1990 LV 45164
6 2000 LV 85
7 2010 DK 11708
8 2010 LV 147
9 2010 PL 2798
10 2010 SE 2785
# Now I add lat/lon values from bits_hh to replace those in mm
missing_db3 <- missing_db2 |>
left_join(bits_hh |> dplyr::select(ShootLat, ShootLong, Hid)) |>
mutate(lat = ifelse(is.na(ShootLat), Latitude, ShootLat),
lon = ifelse(is.na(ShootLong), Longitude, ShootLong))Joining with `by = join_by(Hid)`
left_join: added 2 columns (ShootLat, ShootLong)
> rows only in missing_db2 314,439
> rows only in dplyr::select(bits_hh, .. ( 17,552)
> matched rows 29,670
> =========
> rows total 344,109
mutate: new variable 'lat' (double) with 356 unique values and 0% NA
new variable 'lon' (double) with 372 unique values and 0% NA
# now there is 581 unique coordinates instead of 11
missing_db3 |>
dplyr::select(lat, lon) |>
distinct()distinct: removed 343,528 rows (>99%), 581 rows remaining
# A tibble: 581 × 2
lat lon
<dbl> <dbl>
1 54.2 14.5
2 54.2 15.5
3 54.2 19.5
4 54.8 14.5
5 54.8 15.5
6 54.8 19.5
7 55.2 15.5
8 55.2 16.5
9 55.2 17.5
10 55.2 18.5
# ℹ 571 more rows
Fix prey weights
Fix prey lengths, counts and estimate weights based on taxonomic a and b.
missing_db3 |> # 819 prey weights out of 71 000 should be possible to fix
dplyr::select(Prey_LowerLengthBound, Prey_UpperLengthBound, Prey_Weight, Prey_TotalNo) |>
filter(is.na(Prey_Weight) | Prey_Weight == 0) |>
filter((Prey_LowerLengthBound > 0 | Prey_UpperLengthBound > 0))filter: removed 273,152 rows (79%), 70,957 rows remaining
filter: removed 70,138 rows (99%), 819 rows remaining
# A tibble: 819 × 4
Prey_LowerLengthBound Prey_UpperLengthBound Prey_Weight Prey_TotalNo
<dbl> <dbl> <dbl> <dbl>
1 125 NA NA 1
2 8 NA NA -999
3 8 NA 0 -999
4 1.5 NA 0 -999
5 1.5 NA NA -999
6 3.9 NA 0 1
7 6.3 NA 0 1
8 3.7 NA 0 1
9 4.2 NA 0 1
10 5.5 NA 0 1
# ℹ 809 more rows
# fix prey lengths and weights. Estimate weight from length and count. If count is 0 or NA when length > 0, assume count=1.
missing_db3 |> # 70 957 zeros out of which many are NA
filter(Prey_Weight > 0) |>
summarise(n()) filter: removed 70,957 rows (21%), 273,152 rows remaining
summarise: now one row and one column, ungrouped
# A tibble: 1 × 1
`n()`
<int>
1 273152
# rename and fix count, weight and lengths
missing_db4 <- missing_db3 |>
rename(prey_count = Prey_TotalNo,
prey_weight = Prey_Weight) |>
mutate(Prey_UpperLengthBound = replace_na(Prey_UpperLengthBound, 0),
Prey_LowerLengthBound = replace_na(Prey_LowerLengthBound, 0),
prey_length = if_else(Prey_UpperLengthBound == 0, Prey_LowerLengthBound,
(Prey_UpperLengthBound + Prey_LowerLengthBound)/2,
missing = NA),# there were few Upper (233)
prey_count = replace_na(prey_count, 0),
prey_weight = replace_na(prey_weight, 0),
prey_weight_source = if_else(prey_weight > 0, "observed",
if_else(prey_count > 0 & prey_length > 0, "estimated",
if_else(prey_count == 0 & prey_length > 0, "estimated_zerocount",
NA, missing = NA),
missing = NA)))rename: renamed 2 variables (prey_count, prey_weight)
mutate: changed 80,919 values (24%) of 'prey_count' (80,919 fewer NAs)
changed 36,390 values (11%) of 'prey_weight' (36,390 fewer NAs)
changed 125,882 values (37%) of 'Prey_LowerLengthBound' (125,882 fewer NAs)
changed 344,070 values (>99%) of 'Prey_UpperLengthBound' (344,070 fewer NAs)
new variable 'prey_length' (double) with 348 unique values and 0% NA
new variable 'prey_weight_source' (character) with 4 unique values and 20% NA
# join in taxonomy and a and b
old_db_preyTaxamatch_LW <- preyTaxamatch_LW |>
dplyr::select("valid_AphiaID","valid_name","kingdom","phylum","class","order","family","genus","db_AphiaID","db_ScientificName","db_CleanScientificName","abTaxlevel","a","b") |>
mutate(abTaxlevel = factor(abTaxlevel, Taxonomic_hierarchy)) |>
arrange(db_ScientificName, abTaxlevel) |> # here we use db_sciname and not AID
distinct(db_ScientificName, .keep_all = TRUE)mutate: converted 'abTaxlevel' from character to factor (0 new NA)
distinct: removed 81 rows (13%), 523 rows remaining
# Before joining in as and bs and taxonomy, we first fix NA prey latin name using Stomach_Item where possible when latin name is Na and prey weight or length is positive (10 276)
missing_db4 |>
filter(is.na(Prey_LatinName)) |>
filter(prey_length > 0 | prey_weight > 0) |>
summarise(fix = unique(Stomach_Item))filter: removed 265,122 rows (77%), 78,987 rows remaining
filter: removed 68,711 rows (87%), 10,276 rows remaining
Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
dplyr 1.1.0.
ℹ Please use `reframe()` instead.
ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
always returns an ungrouped data frame and adjust accordingly.
ℹ The deprecated feature was likely used in the tidylog package.
Please report the issue at <https://github.com/elbersb/tidylog/issues>.
summarise: now 64 rows and one column, ungrouped
# A tibble: 64 × 1
fix
<chr>
1 Pontoporeia
2 Unidentified fish
3 Fish eggs
4 Algae
5 Insect
6 Clupeidae scales
7 Unidentified algae covered wit
8 Unidentified mass
9 Unidentified remains
10 Fucus
# ℹ 54 more rows
# where we lack latin name and there is a stomach item, use stomach item as latin name
missing_db5 <- missing_db4 |>
mutate(Prey_LatinName = if_else(!is.na(Prey_LatinName) & !is.na(Stomach_Item), Prey_LatinName, Stomach_Item))mutate: changed 10,663 values (3%) of 'Prey_LatinName' (9,933 fewer NAs)
# now join in as, bs and taxonomy
# missing_db5 |> filter(is.na(Prey_LatinName)) |> # No AID without latinnanme
# distinct(Prey_AphiaID)
missing_db6 <- missing_db5 |> # note necessary argument na_matches
left_join(old_db_preyTaxamatch_LW, join_by("Prey_LatinName"=="db_ScientificName"), na_matches = "never")left_join: added 13 columns (valid_AphiaID, valid_name, kingdom, phylum, class, …)
> rows only in missing_db5 276
> rows only in old_db_preyTaxamatch_LW ( 357)
> matched rows 343,833
> =========
> rows total 344,109
# no a and b missing where we can fix prey weights
missing_db6 |> filter(is.na(a) & prey_weight == 0 & prey_length != 0) |>
distinct(Prey_AphiaID, Prey_LatinName, .keep_all = TRUE)filter: removed all rows (100%)
distinct: no rows removed
# A tibble: 0 × 58
# ℹ 58 variables: Dataset <chr>, Country <chr>, Ship <chr>, Latitude <dbl>,
# Longitude <dbl>, Estimated_Lat_Lon <chr>, ICESrectangle <chr>, Year <dbl>,
# Month <chr>, Day <chr>, Time <chr>, Station <chr>, HaulNo <chr>,
# Sampling_Method <chr>, Depth <dbl>, Temperature <dbl>,
# SampleNo(FishID) <dbl>, ICES_SampleID <dbl>, Predator_AphiaID <dbl>,
# Predator_LatinName <chr>, Predator_Weight(mean) <dbl>,
# Predator_Lengh(mean) <dbl>, Predator_CPUE <dbl>, Stomach_WithFood <dbl>, …
# Estimate weights based on length
missing_db7 <- missing_db6 |>
mutate(prey_weight = if_else(is.na(a) & is.na(b),
case_when(prey_weight_source == "estimated_zerocount" ~ (0.01*prey_length^3)*1,
prey_weight_source == "estimated" ~ (0.01*prey_length^3)*prey_count,
.default = prey_weight),
case_when(prey_weight_source == "estimated_zerocount" ~ (a*prey_length^b)*1,
prey_weight_source == "estimated" ~ (a*prey_length^b)*prey_count,
.default = prey_weight)),
prey_weight_ind = if_else(prey_count > 0 & prey_weight > 0, prey_weight / prey_count, NA))mutate: changed 763 values (<1%) of 'prey_weight' (0 new NAs)
new variable 'prey_weight_ind' (double) with 13,623 unique values and 51% NA
table(missing_db7$prey_weight_source, useNA = "ifany") # 70194 zeros where we cant estimate weight (equal to initial calculations at top of chunk)
estimated estimated_zerocount observed <NA>
578 185 273152 70194
# What to do with 70 000 missing weights. We can give them average weight like for the new db.
missing_db7 |> # where there is a prey species IDd, there are only ~1000 zero weights that we can fix. Here, I assume count Na/0 is zero.
filter(!is.na(Prey_AphiaID) & prey_weight == 0 & prey_count >= 0)filter: removed 343,047 rows (>99%), 1,062 rows remaining
# A tibble: 1,062 × 59
Dataset Country Ship Latitude Longitude Estimated_Lat_Lon ICESrectangle
<chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
1 Stomach dat… DK DAN2 55.2 15.5 <NA> 39G5
2 Stomach dat… PL BAL 55.2 16.5 <NA> 39G6
3 Stomach dat… PL BAL 55.2 16.5 <NA> 39G6
4 Stomach dat… DK DAN2 55.2 15.5 <NA> 39G5
5 Stomach dat… LV MAZ 54.8 15.5 <NA> 38G5
6 Stomach dat… DK DAN2 55.8 15.5 <NA> 40G5
7 Stomach dat… LV CLV 57.8 22.5 <NA> 44H2
8 Stomach dat… LV CLV 56.2 20.5 <NA> 41H0
9 Stomach dat… LV CLV 56.2 20.5 <NA> 41H0
10 Stomach dat… LV CLV 55.8 20.5 <NA> 40H0
# ℹ 1,052 more rows
# ℹ 52 more variables: Year <dbl>, Month <chr>, Day <chr>, Time <chr>,
# Station <chr>, HaulNo <chr>, Sampling_Method <chr>, Depth <dbl>,
# Temperature <dbl>, `SampleNo(FishID)` <dbl>, ICES_SampleID <dbl>,
# Predator_AphiaID <dbl>, Predator_LatinName <chr>,
# `Predator_Weight(mean)` <dbl>, `Predator_Lengh(mean)` <dbl>,
# Predator_CPUE <dbl>, Stomach_WithFood <dbl>, Stomach_Regurgitated <dbl>, …
# To control for outliers affecting average of individual weight, remove outliers outside quartiles
prey_avg_ind_weight_odb <- missing_db7 |> #prey_avg_ind_weight_odb |>
filter(!is.na(Prey_AphiaID)) |>
filter(!prey_weight_ind %in% boxplot.stats(prey_weight_ind)$out, .by = Prey_AphiaID) |>
reframe(avg_weight = mean(prey_weight_ind, na.rm = TRUE), .by = Prey_AphiaID)filter: removed 79,498 rows (23%), 264,611 rows remaining
filter: removed 18,107 rows (7%), 246,504 rows remaining
# NaNs are produced despite na.rm=TRUE and nas in Prey_AphiaID are removed
prey_avg_ind_weight_odb |>
filter(is.na(avg_weight))filter: removed 114 rows (92%), 10 rows remaining
# A tibble: 10 × 2
Prey_AphiaID avg_weight
<dbl> <dbl>
1 101361 NaN
2 1066 NaN
3 1080 NaN
4 120020 NaN
5 120178 NaN
6 126508 NaN
7 127143 NaN
8 127196 NaN
9 1307 NaN
10 134958 NaN
#set the NaNs to 0
prey_avg_ind_weight_odb <- prey_avg_ind_weight_odb |>
mutate(avg_weight = replace_na(avg_weight, 0))mutate: changed 10 values (8%) of 'avg_weight' (10 fewer NAs)
# Join average weight and estimate weight
missing_db8 <- missing_db7 |>
left_join(prey_avg_ind_weight_odb) |>
mutate(prey_weight = if_else(prey_weight <= 0 & prey_count > 0 & !is.na(Prey_AphiaID), prey_count * avg_weight, prey_weight))Joining with `by = join_by(Prey_AphiaID)`
left_join: added one column (avg_weight)
> rows only in missing_db7 79,498
> rows only in prey_avg_ind_weight_odb ( 0)
> matched rows 264,611
> =========
> rows total 344,109
mutate: changed 569 values (<1%) of 'prey_weight' (0 new NAs)
missing_db8 |> # 69 621 zeros out of which many come from NAs
filter(prey_weight == 0)filter: removed 274,484 rows (80%), 69,625 rows remaining
# A tibble: 69,625 × 60
Dataset Country Ship Latitude Longitude Estimated_Lat_Lon ICESrectangle
<chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
1 Stomach dat… LV MAZ 54.2 14.5 <NA> 37G4
2 Stomach dat… LV UNKN 54.2 15.5 <NA> 37G5
3 Stomach dat… LV UNKN 54.2 15.5 <NA> 37G5
4 Stomach dat… LV UNKN 54.2 15.5 <NA> 37G5
5 Stomach dat… LV UNKN 54.2 15.5 <NA> 37G5
6 Stomach dat… LV ZBA 54.2 15.5 <NA> 37G5
7 Stomach dat… LV ZBA 54.2 15.5 <NA> 37G5
8 Stomach dat… LV MAZ 54.2 19.5 <NA> 37G9
9 Stomach dat… LV MAZ 54.2 19.5 <NA> 37G9
10 Stomach dat… LV MAZ 54.2 19.5 <NA> 37G9
# ℹ 69,615 more rows
# ℹ 53 more variables: Year <dbl>, Month <chr>, Day <chr>, Time <chr>,
# Station <chr>, HaulNo <chr>, Sampling_Method <chr>, Depth <dbl>,
# Temperature <dbl>, `SampleNo(FishID)` <dbl>, ICES_SampleID <dbl>,
# Predator_AphiaID <dbl>, Predator_LatinName <chr>,
# `Predator_Weight(mean)` <dbl>, `Predator_Lengh(mean)` <dbl>,
# Predator_CPUE <dbl>, Stomach_WithFood <dbl>, Stomach_Regurgitated <dbl>, …
Make prey groups ODB
# Make prey groups
missing_db9 <- missing_db8 |> # arguments are evaluated in order so proceed from specific to general
mutate(prey_group = case_when(valid_name == "Clupea harengus"~ "herring",
valid_name == "Sprattus sprattus"~ "sprat",
valid_name == "Saduria entomon" ~ "saduria",
family == "Clupeidae" ~ "unidd_clupeids", # temporary, see below
#is.na(valid_name) ~ "other",
phylum == "Chordata"~ "other_chords",
kingdom == "Animalia" & !phylum == "Chordata" ~ "other_inverts",
.default = "other"))mutate: new variable 'prey_group' (character) with 7 unique values and 0% NA
# seems to work
missing_db9 |>
summarise(n = n(), .by= prey_group)summarise: now 7 rows and 2 columns, ungrouped
# A tibble: 7 × 2
prey_group n
<chr> <int>
1 other 69334
2 other_inverts 120172
3 other_chords 14160
4 unidd_clupeids 1601
5 herring 16601
6 sprat 46381
7 saduria 75860
missing_db9 |> # only NA prey ids and Fucus in the other group. We set all non bio to NA in the get taxonomy chunk.
filter(prey_group == "other") |>
summarise(n = n(), .by = db_CleanScientificName)filter: removed 274,775 rows (80%), 69,334 rows remaining
summarise: now 2 rows and 2 columns, ungrouped
# A tibble: 2 × 2
db_CleanScientificName n
<chr> <int>
1 <NA> 69330
2 Fucus 4
# The NAs
missing_db9 |>
filter(is.na(prey_group))filter: removed all rows (100%)
# A tibble: 0 × 61
# ℹ 61 variables: Dataset <chr>, Country <chr>, Ship <chr>, Latitude <dbl>,
# Longitude <dbl>, Estimated_Lat_Lon <chr>, ICESrectangle <chr>, Year <dbl>,
# Month <chr>, Day <chr>, Time <chr>, Station <chr>, HaulNo <chr>,
# Sampling_Method <chr>, Depth <dbl>, Temperature <dbl>,
# SampleNo(FishID) <dbl>, ICES_SampleID <dbl>, Predator_AphiaID <dbl>,
# Predator_LatinName <chr>, Predator_Weight(mean) <dbl>,
# Predator_Lengh(mean) <dbl>, Predator_CPUE <dbl>, Stomach_WithFood <dbl>, …
# only fish and a few Gnathostomata in other Chordates
missing_db9 |>
filter(prey_group %in% c("other_chords")) |>
summarise(n= n(), .by = valid_name)filter: removed 329,949 rows (96%), 14,160 rows remaining
summarise: now 50 rows and 2 columns, ungrouped
# A tibble: 50 × 2
valid_name n
<chr> <int>
1 Osteichthyes 1299
2 Gadus morhua 1215
3 Trachurus 5
4 Enchelyopus cimbrius 778
5 Pleuronectidae 69
6 Merlangius merlangus 3
7 Gasterosteoidei 277
8 Gasterosteus aculeatus 172
9 Pungitius pungitius 1
10 Callionymus 34
# ℹ 40 more rows
# only invert phyla
missing_db9 |>
filter(prey_group %in% c("other_inverts")) |>
summarise(n= n(), .by = phylum)filter: removed 223,937 rows (65%), 120,172 rows remaining
summarise: now 7 rows and 2 columns, ungrouped
# A tibble: 7 × 2
phylum n
<chr> <int>
1 Arthropoda 93700
2 Annelida 24796
3 Mollusca 708
4 Priapulida 929
5 Echinodermata 7
6 Rotifera 14
7 Cnidaria 18
# Perfect apart from that in the other group there are some animal prey that is classified as animalia: "Zooplancton","Unidentified invertebrata","Unidentified worm". We choose to live with having these as "other" and not as invertebrates.
missing_db9 |>
filter(prey_group == "other",
prey_weight > 0) |>
summarise(n = n(), .by = Prey_LatinName)filter: removed 343,797 rows (>99%), 312 rows remaining
summarise: now 23 rows and 2 columns, ungrouped
# A tibble: 23 × 2
Prey_LatinName n
<chr> <int>
1 Algae 61
2 Unidentified algae covered wit 2
3 Unidentified mass 9
4 Unidentified remains 4
5 Fucus 4
6 Stone 109
7 Spawn 2
8 Waste 12
9 Siphon 4
10 Plastic 24
# ℹ 13 more rows
# Duplicates?
dup <- missing_db9 |>
group_by(Haul_ID) |>
dplyr::select(-Dataset, -ICES_SampleID) |>
duplicated() |>
which() group_by: one grouping variable (Haul_ID)
# remove duplicates
missing_db9 <- missing_db9[-dup,]Sum prey group weights per predator
# remove zero weight predators and prey that are to no use, then make predator identifier. Remove NA prey weights
missing_db10 <- missing_db9 |>
rename(pred_weight = `Predator_Weight(mean)`,
pred_length = `Predator_Lengh(mean)`) |>
filter( pred_weight > 0 | pred_length > 0) |>
mutate(pred_ID = paste(Haul_ID, `SampleNo(FishID)`, pred_weight, pred_length, sep = "_")) # both weight and length needed to get all fish a unique ID.rename: renamed 2 variables (pred_weight, pred_length)
filter: removed 82 rows (<1%), 343,881 rows remaining
mutate: new variable 'pred_ID' (character) with 97,412 unique values and 0% NA
# In the old db (which is not the case for the new db), we have both pred and prey data. We can thus use mutate() and distinct() instead of summarise().
missing_db10_sum <- missing_db10 |>
mutate(prey_tot_weight = sum(prey_weight), .by = c(pred_ID, prey_group)) |>
distinct(pred_ID, prey_group, .keep_all = TRUE)mutate: new variable 'prey_tot_weight' (double) with 13,231 unique values and 0% NA
distinct: removed 218,468 rows (64%), 125,413 rows remaining
# summarise(tot_weight = sum(prey_weight), .by = c(pred_ID, prey_group))
# we have preserved the predators
length(unique(missing_db10$pred_ID)) [1] 97412
length(unique(missing_db10_sum$pred_ID)) [1] 97412
# make wide format, the id_cols argument is necessary to identify the columns to keep in the wide
missing_db10_sum2 <- missing_db10_sum |>
pivot_wider( id_cols = c(Country,Estimated_Lat_Lon,ICESrectangle,Year,Month,Day,pred_weight,pred_length,Haul_ID,lat,lon,pred_ID,Depth), names_from = "prey_group", values_from = "prey_tot_weight", values_fill = 0)pivot_wider: reorganized (Dataset, Ship, Latitude, Longitude, Time, …) into
(other, other_inverts, other_chords, unidd_clupeids, herring, …) [was
125413x63, now 97412x20]
# no duplicates
missing_db10_sum2 |> summarise(count = n(), .by = pred_ID) |> filter(count > 1)summarise: now 97,412 rows and 2 columns, ungrouped
filter: removed all rows (100%)
# A tibble: 0 × 2
# ℹ 2 variables: pred_ID <chr>, count <int>
# Split unidentified clupeids on sprat and herring
missing_db10_sum3 <- missing_db10_sum2 |>
mutate(sprat = sprat + unidd_clupeids*0.5,
herring = herring + unidd_clupeids*0.5) |>
dplyr::select(-unidd_clupeids)mutate: changed 919 values (1%) of 'herring' (0 new NAs)
changed 919 values (1%) of 'sprat' (0 new NAs)
# identify if theres is na in the prey group columns
missing_db10_sum3 |> # no there isnt
map(\(.) sum(is.na(.))) $Country
[1] 0
$Estimated_Lat_Lon
[1] 97412
$ICESrectangle
[1] 0
$Year
[1] 0
$Month
[1] 0
$Day
[1] 622
$pred_weight
[1] 90900
$pred_length
[1] 0
$Haul_ID
[1] 0
$lat
[1] 0
$lon
[1] 0
$pred_ID
[1] 0
$Depth
[1] 97412
$other
[1] 0
$other_inverts
[1] 0
$other_chords
[1] 0
$herring
[1] 0
$sprat
[1] 0
$saduria
[1] 0
missing_db10_sum3 |>
filter(is.na(other_chords)) filter: removed all rows (100%)
# A tibble: 0 × 19
# ℹ 19 variables: Country <chr>, Estimated_Lat_Lon <chr>, ICESrectangle <chr>,
# Year <dbl>, Month <chr>, Day <chr>, pred_weight <dbl>, pred_length <dbl>,
# Haul_ID <chr>, lat <dbl>, lon <dbl>, pred_ID <chr>, Depth <dbl>,
# other <dbl>, other_inverts <dbl>, other_chords <dbl>, herring <dbl>,
# sprat <dbl>, saduria <dbl>
# replace na with 0
missing_db10_sum4 <- missing_db10_sum3 |>
mutate(other = replace_na(other,0),
other_inverts = replace_na(other_inverts,0),
other_chords = replace_na(other_chords,0))mutate: no changes
#colnames(missing_db12_sum) #select columns without prey info .
old_data <- missing_db10_sum4 |>
dplyr::select(Country,Estimated_Lat_Lon,ICESrectangle,Year,Month,Day,pred_weight,pred_length,Haul_ID,lat,lon,pred_ID,other_inverts,other_chords,other,herring,sprat,saduria,Depth)
old_data |> # there are unlikely extremes. These are filtered on relative prey weight in the analysis.
rowwise() |>
mutate(total = sum(other, other_inverts, saduria, herring, sprat, other_chords)) |>
filter( total > 1000) |>
arrange(desc(total)) |>
ungroup()mutate: new variable 'total' (double) with 15,297 unique values and 0% NA
filter: removed 97,155 rows (>99%), 257 rows remaining
ungroup: no grouping variables remain
# A tibble: 257 × 20
Country Estimated_Lat_Lon ICESrectangle Year Month Day pred_weight
<chr> <chr> <chr> <dbl> <chr> <chr> <dbl>
1 LV <NA> 43H0 1980 4 19 NA
2 LV <NA> 44H3 1980 12 19 NA
3 LV <NA> 44H1 1981 6 18 NA
4 DK <NA> 39G7 2009 11 14 1274
5 LV <NA> 40G8 1978 3 23 NA
6 LV <NA> 44H1 1981 6 18 NA
7 LV <NA> 38G9 1985 3 16 NA
8 LV <NA> 44H1 1977 10 3 NA
9 LV <NA> 44H1 1982 9 9 NA
10 LV <NA> 39G8 1970 1 26 NA
# ℹ 247 more rows
# ℹ 13 more variables: pred_length <dbl>, Haul_ID <chr>, lat <dbl>, lon <dbl>,
# pred_ID <chr>, other_inverts <dbl>, other_chords <dbl>, other <dbl>,
# herring <dbl>, sprat <dbl>, saduria <dbl>, Depth <dbl>, total <dbl>
old_data |> # there are unlikely extremes. These are filtered on relative prey weight in the analysis.
rowwise() |>
mutate(total = sum(other, other_inverts, saduria, herring, sprat, other_chords),
empty_stom = ifelse(total == 0, 1, 0)) |>
ungroup() |>
summarise(empty_n = n(), .by = empty_stom) # 35 000 empty stomachs... 63 000 not emptymutate: new variable 'total' (double) with 15,297 unique values and 0% NA
new variable 'empty_stom' (double) with 2 unique values and 0% NA
ungroup: no grouping variables remain
summarise: now 2 rows and 2 columns, ungrouped
# A tibble: 2 × 2
empty_stom empty_n
<dbl> <int>
1 1 34598
2 0 62814
5. New SE data
This data has already been prepared/cleaned to som extent in another project
# Calculate total weight of prey by predator ID and prey species (i.e., across prey sizes). First create wide data frame so that I can sum easily across prey groups (columns)
SE_preyTaxamatch_LW <- preyTaxamatch_LW |>
dplyr::select(valid_AphiaID,valid_name,kingdom,phylum,class,order,family,genus,db_AphiaID,db_ScientificName,db_CleanScientificName,abTaxlevel,a,b) |>
mutate(abTaxlevel = factor(abTaxlevel, Taxonomic_hierarchy)) |>
arrange(db_ScientificName, abTaxlevel) |> # here we use db_sciname and not AID
distinct(db_ScientificName, .keep_all = TRUE)mutate: converted 'abTaxlevel' from character to factor (0 new NA)
distinct: removed 81 rows (13%), 523 rows remaining
# Join in as, bs and taxonomy
SE2 <- SE |> # note necessary argument na_matches
left_join(SE_preyTaxamatch_LW, join_by("prey_latin_name"=="db_ScientificName"), na_matches = "never")left_join: added 13 columns (valid_AphiaID, valid_name, kingdom, phylum, class, …)
> rows only in SE 973
> rows only in SE_preyTaxamatch_LW ( 445)
> matched rows 91,895
> ========
> rows total 92,868
# there are no prey to fix here!
SE2 |>
filter(prey_weight_g == 0 & prey_length_cm > 0) |>
distinct(prey_latin_name)filter: removed all rows (100%)
distinct: no rows removed
# A tibble: 0 × 1
# ℹ 1 variable: prey_latin_name <chr>
# Make prey classes
SE3 <- SE2 |> # arguments are evaluated in order so proceed from specific to general
mutate(prey_group = case_when(valid_name == "Clupea harengus"~ "herring",
valid_name == "Sprattus sprattus"~ "sprat",
valid_name == "Saduria entomon" ~ "saduria",
family == "Clupeidae" ~ "unidd_clupeids", # temporary, see below
phylum == "Chordata"~ "other_chords",
kingdom == "Animalia" & !phylum == "Chordata" ~ "other_inverts",
.default = "other"))mutate: new variable 'prey_group' (character) with 7 unique values and 0% NA
SE3 |> summarise(n = n(), .by= prey_group)summarise: now 7 rows and 2 columns, ungrouped
# A tibble: 7 × 2
prey_group n
<chr> <int>
1 sprat 1520
2 herring 572
3 other_chords 3433
4 saduria 6909
5 unidd_clupeids 448
6 other_inverts 76639
7 other 3347
SE4 <- SE3 |>
rename(prey_weight = prey_weight_g,
pred_weight = pred_weight_g,
pred_length = pred_length_cm)rename: renamed 3 variables (pred_weight, pred_length, prey_weight)
# Summarise total prey weight per predator and prey gorup
SE4_sum <- SE4 |>
mutate(prey_tot_weight = sum(prey_weight), .by = c(pred_id, prey_group)) |>
distinct(pred_id, prey_group, .keep_all = TRUE)mutate: new variable 'prey_tot_weight' (double) with 3,060 unique values and 0% NA
distinct: removed 79,751 rows (86%), 13,117 rows remaining
## summarise(tot_weight = sum(prey_weight), .by = c(pred_ID, prey_group))
# make wide format
SE4_sum2 <- SE4_sum |>
pivot_wider( id_cols = c(pred_id,year,month,pred_weight,pred_length,haul_id,lon,lat,date,subdiv,ices_rect,predator_latin_name,bottom_depth), names_from = "prey_group", values_from = "prey_tot_weight", values_fill = 0)pivot_wider: reorganized (gall_bladder, prey_weight, prey_latin_name,
predator_code, quarter, …) into (sprat, herring, other_chords, saduria,
unidd_clupeids, …) [was 13117x53, now 9747x20]
# SE4_sum2 |> distinct(pred_id) |> summarise(n=n())
# Split unidentified clupeids on sprat and herring
SE4_sum3 <- SE4_sum2 |>
mutate(sprat = sprat + unidd_clupeids*0.5,
herring = herring + unidd_clupeids*0.5) |>
dplyr::select(-unidd_clupeids)mutate: changed 344 values (4%) of 'sprat' (0 new NAs)
changed 344 values (4%) of 'herring' (0 new NAs)
SE4_sum3 |> # no na in prey groups
map(\(.) sum(is.na(.)))$pred_id
[1] 0
$year
[1] 0
$month
[1] 0
$pred_weight
[1] 0
$pred_length
[1] 0
$haul_id
[1] 0
$lon
[1] 0
$lat
[1] 0
$date
[1] 0
$subdiv
[1] 0
$ices_rect
[1] 289
$predator_latin_name
[1] 289
$bottom_depth
[1] 289
$sprat
[1] 0
$herring
[1] 0
$other_chords
[1] 0
$saduria
[1] 0
$other_inverts
[1] 0
$other
[1] 0
6. Merge data sets
# rename ids and add a column to separate new from old database
new_db_data <- new_data |>
rename(lat = ShootLat,
lon = ShootLong) |>
mutate(#Haul_ID_tbl = as.character(tblHaulID),
Haul_ID = paste(Country, Year, Month, Day, HaulNo, ICESrectangle, sep = "_"),
pred_ID = as.character(tblPredatorInformationID),
data_source = "new_db")rename: renamed 2 variables (lat, lon)
mutate: new variable 'Haul_ID' (character) with 1,642 unique values and 0% NA
new variable 'pred_ID' (character) with 35,476 unique values and 0% NA
new variable 'data_source' (character) with one unique value and 0% NA
old_db_data <- old_data |>
mutate(Day = as.double(Day),
Month = as.double(Month),
data_source = "old_db")mutate: converted 'Month' from character to double (0 new NA)
converted 'Day' from character to double (0 new NA)
new variable 'data_source' (character) with one unique value and 0% NA
names(SE4_sum3) [1] "pred_id" "year" "month"
[4] "pred_weight" "pred_length" "haul_id"
[7] "lon" "lat" "date"
[10] "subdiv" "ices_rect" "predator_latin_name"
[13] "bottom_depth" "sprat" "herring"
[16] "other_chords" "saduria" "other_inverts"
[19] "other"
new_SE_data <- SE4_sum3 |>
filter(str_detect(pred_id, "COD")) |>
#filter(predator_latin_name %in% "Gadus morhua") |>
rename(pred_ID = pred_id,
Depth = bottom_depth,
ICESrectangle = ices_rect) |>
separate(haul_id,
sep = "_",
into = c("Year", "Quarter", "HaulNo"),
remove = TRUE) |>
dplyr::select(!Year) |>
rename(Year = year) |>
mutate(Country = "SE",
data_source = "new_SE",
Day = day(date),
Month = month,
ICESrectangle = if_else(is.na(ICESrectangle), ices.rect2(lon, lat), ICESrectangle),
Haul_ID = paste(Country, Year, Month, Day, HaulNo, ICESrectangle, sep = "_"))filter: removed 3,882 rows (40%), 5,865 rows remaining
rename: renamed 3 variables (pred_ID, ICESrectangle, Depth)
rename: renamed one variable (Year)
mutate: changed 289 values (5%) of 'ICESrectangle' (289 fewer NAs)
new variable 'Country' (character) with one unique value and 0% NA
new variable 'data_source' (character) with one unique value and 0% NA
new variable 'Day' (integer) with 20 unique values and 0% NA
new variable 'Month' (double) with 3 unique values and 0% NA
new variable 'Haul_ID' (character) with 332 unique values and 0% NA
comcol_all_data_pred <- intersect( intersect(colnames(new_db_data), colnames(old_db_data)),
colnames(new_SE_data))
bind_rows( new_db_data |> dplyr::select(Year, pred_ID, data_source),
old_db_data |> dplyr::select(Year, pred_ID, data_source),
new_SE_data |> dplyr::select(Year, pred_ID, data_source) ) |>
summarise(count = n(), .by = c(Year, data_source)) |>
ggplot() +
geom_bar(aes(Year, count, fill = data_source), alpha = 1, stat="identity", position = "stack") +
scale_x_continuous(n.breaks = 10) + ylab("# stomachs")summarise: now 106 rows and 3 columns, ungrouped
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.
# Check if any of the two datasets have NA in the common columns.
unique(is.na(old_db_data |> dplyr::select(all_of(comcol_all_data_pred)))) # Depth is all NAs but we dont need it. Day we may need however and these stomachs need to be removed in case day is a predictor. Consider also when adding in environmental data. Pred_weight will be filled in section 7. other_inverts saduria other_chords sprat herring other Year Month Day
[1,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[2,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[3,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
pred_weight pred_length lat lon ICESrectangle Depth Country Haul_ID
[1,] TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
[2,] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
[3,] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
pred_ID data_source
[1,] FALSE FALSE
[2,] FALSE FALSE
[3,] FALSE FALSE
unique(is.na(new_db_data |> dplyr::select(all_of(comcol_all_data_pred)))) other_inverts saduria other_chords sprat herring other Year Month Day
[1,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[2,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
pred_weight pred_length lat lon ICESrectangle Depth Country Haul_ID
[1,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[2,] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
pred_ID data_source
[1,] FALSE FALSE
[2,] FALSE FALSE
unique(is.na(new_SE_data |> dplyr::select(all_of(comcol_all_data_pred)))) other_inverts saduria other_chords sprat herring other Year Month Day
[1,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[2,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
pred_weight pred_length lat lon ICESrectangle Depth Country Haul_ID
[1,] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
[2,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
pred_ID data_source
[1,] FALSE FALSE
[2,] FALSE FALSE
str(old_db_data)tibble [97,412 × 20] (S3: tbl_df/tbl/data.frame)
$ Country : chr [1:97412] "LV" "LV" "LV" "LV" ...
$ Estimated_Lat_Lon: chr [1:97412] NA NA NA NA ...
$ ICESrectangle : chr [1:97412] "37G4" "37G5" "37G5" "37G5" ...
$ Year : num [1:97412] 1968 1973 1973 1973 1973 ...
$ Month : num [1:97412] 9 5 5 5 5 12 12 1 1 1 ...
$ Day : num [1:97412] 19 11 11 11 11 16 16 22 22 22 ...
$ pred_weight : num [1:97412] NA NA NA NA NA NA NA NA NA NA ...
$ pred_length : num [1:97412] 25 17 17 18 16 60 77 18 25 19 ...
$ Haul_ID : chr [1:97412] "LV_1968_9_19_1_37G4" "LV_1973_5_11_15_37G5" "LV_1973_5_11_15_37G5" "LV_1973_5_11_15_37G5" ...
$ lat : num [1:97412] 54.2 54.2 54.2 54.2 54.2 ...
$ lon : num [1:97412] 14.5 15.5 15.5 15.5 15.5 15.5 15.5 19.5 19.5 19.5 ...
$ pred_ID : chr [1:97412] "LV_1968_9_19_1_37G4_52_NA_25" "LV_1973_5_11_15_37G5_100_NA_17" "LV_1973_5_11_15_37G5_69_NA_17" "LV_1973_5_11_15_37G5_74_NA_18" ...
$ other_inverts : num [1:97412] 0 0 0 0 0 0 0 0 0 0 ...
$ other_chords : num [1:97412] 0 0 0 0 0 0 0 0 0 0 ...
$ other : num [1:97412] 0 0 0 0 0 0 0 0 0 0 ...
$ herring : num [1:97412] 0 0 0 0 0 0 0 0 0 0 ...
$ sprat : num [1:97412] 0 0 0 0 0 0 0 0 0 0 ...
$ saduria : num [1:97412] 0 0 0 0 0 0 0 0 0 0 ...
$ Depth : num [1:97412] NA NA NA NA NA NA NA NA NA NA ...
$ data_source : chr [1:97412] "old_db" "old_db" "old_db" "old_db" ...
str(new_db_data)tibble [35,476 × 50] (S3: tbl_df/tbl/data.frame)
$ tblPredatorInformationID: num [1:35476] 55095 55096 55097 55098 55099 ...
$ other_inverts : num [1:35476] 0.12 0.07 0.06 0.06 0.03 0.04 0.15 0.38 0.015 0.053 ...
$ saduria : num [1:35476] 0 0 0 0 0 0 0 0 0 0 ...
$ other_chords : num [1:35476] 0 0 0 0 0 0 0 0 0 0 ...
$ sprat : num [1:35476] 0 0 0 0 0 0 0 0 0 0 ...
$ herring : num [1:35476] 0 0 0 0 0 0 0 0 0 0 ...
$ other : num [1:35476] 0 0 0 0 0 0 0 0 0 0 ...
$ tblUploadID : num [1:35476] 8247 8247 8247 8247 8247 ...
$ tblHaulID : num [1:35476] 4701 4702 4702 4702 4702 ...
$ Ship : chr [1:35476] "90MZ" "90MZ" "90MZ" "90MZ" ...
$ Gear : chr [1:35476] "LBT" "LBT" "LBT" "LBT" ...
$ HaulNo : num [1:35476] 1 14 14 14 14 14 30 30 1 1 ...
$ StationNumber : num [1:35476] 1 1 1 1 1 1 1 1 3 3 ...
$ Year : num [1:35476] 1969 1969 1969 1969 1969 ...
$ Month : num [1:35476] 1 1 1 1 1 1 1 1 3 3 ...
$ Day : num [1:35476] 8 19 19 19 19 19 24 24 15 15 ...
$ Time : chr [1:35476] "0710" "0720" "0720" "0720" ...
$ FishID : num [1:35476] 95 1 2 3 5 6 31 32 82 83 ...
$ AphiaIDPredator : num [1:35476] 126436 126436 126436 126436 126436 ...
$ pred_weight : num [1:35476] 15.6 2.26 1.7 1.04 1.92 1.8 22.5 23.7 1.35 2.69 ...
$ Number : num [1:35476] 1 1 1 1 1 1 1 1 1 1 ...
$ MeasurementIncrement : num [1:35476] 1 1 1 1 1 1 1 1 1 1 ...
$ pred_length : num [1:35476] 13 8 8 6 6 6 15 15 6 7 ...
$ Code : chr [1:35476] NA NA NA NA ...
$ Age : num [1:35476] NA NA NA NA NA NA NA NA NA NA ...
$ Sex : chr [1:35476] NA NA NA NA ...
$ MaturityScale : chr [1:35476] NA NA NA NA ...
$ MaturityStage : num [1:35476] NA NA NA NA NA NA NA NA NA NA ...
$ PreservationMethod : chr [1:35476] "NBF" "NBF" "NBF" "NBF" ...
$ Regurgitated : num [1:35476] 0 0 0 0 0 0 0 0 0 0 ...
$ StomachFullness : logi [1:35476] NA NA NA NA NA NA ...
$ FullStomWgt : logi [1:35476] NA NA NA NA NA NA ...
$ EmptyStomWgt : logi [1:35476] NA NA NA NA NA NA ...
$ StomachEmpty : num [1:35476] 0 0 0 0 0 0 0 0 0 0 ...
$ GenSamp : chr [1:35476] "N" "N" "N" "N" ...
$ lat : num [1:35476] 57.5 57.2 57.2 57.2 57.2 ...
$ lon : num [1:35476] 21.1 21.1 21.1 21.1 21.1 ...
$ HaulLat : num [1:35476] NA NA NA NA NA NA NA NA NA NA ...
$ HaulLong : num [1:35476] NA NA NA NA NA NA NA NA NA NA ...
$ ICESrectangle : chr [1:35476] "43H1" "43H1" "43H1" "43H1" ...
$ Depth : num [1:35476] 70 26 26 26 26 26 78 78 48 48 ...
$ Survey : chr [1:35476] NA NA NA NA ...
$ ICESDatabase : chr [1:35476] "N" "N" "N" "N" ...
$ Country : chr [1:35476] "LV" "LV" "LV" "LV" ...
$ Reporting_organisation : num [1:35476] 3140 3140 3140 3140 3140 3140 3140 3140 3140 3140 ...
$ CruiseID : chr [1:35476] "LV314090MZ1969" "LV314090MZ1969" "LV314090MZ1969" "LV314090MZ1969" ...
$ Regurgitated_st : num [1:35476] 0 0 0 0 0 0 0 0 0 0 ...
$ Haul_ID : chr [1:35476] "LV_1969_1_8_1_43H1" "LV_1969_1_19_14_43H1" "LV_1969_1_19_14_43H1" "LV_1969_1_19_14_43H1" ...
$ pred_ID : chr [1:35476] "55095" "55096" "55097" "55098" ...
$ data_source : chr [1:35476] "new_db" "new_db" "new_db" "new_db" ...
# remove Time, Depth and Ship (ship makes w-o columns .x and .y)
all_data <- full_join( old_db_data |> dplyr::select(-c(Depth)), # join in predator data
new_db_data |> dplyr::select(-c(Depth, Ship)),
by = comcol_all_data_pred[!comcol_all_data_pred %in% c("Depth","Ship")]) |>
full_join(new_SE_data, by = comcol_all_data_pred[!comcol_all_data_pred %in% c("Depth","Ship")] )full_join: added 30 columns (tblPredatorInformationID, tblUploadID, tblHaulID, Gear, HaulNo, …)
> rows only in dplyr::select(old_db_da.. 97,412
> rows only in dplyr::select(new_db_da.. 35,476
> matched rows 0
> =========
> rows total 132,888
full_join: added 8 columns (HaulNo.x, month, Quarter, HaulNo.y, date, …)
> rows only in full_join(dplyr::select.. 132,888
> rows only in new_SE_data 5,865
> matched rows 0
> =========
> rows total 138,753
7. Estimate weight of predators using yearly lw coefficients
The size of the cod in both the new db and old db are mainly represented by lengh and the length-weight relationship in eastern baltic cod has varied over time. This will likely affect our relative prey weight estimates and we want to account for this by estimate a yearly a and b using combined data from BITS (1978-) and the stomach data (1963-).
# Model a yearly LW-relationship as a linear model of log(weight) ~ log(length). And then fit gam with a a smooth of year to smooth out the variation in a and b over time.
# identify data to use for estimating in the stomach data
all_data2 <- all_data |>
mutate(pred_weight = replace_na(pred_weight, 0),
pred_weight_source = ifelse(pred_weight <= 0, "estimated", "observed"))mutate: changed 91,587 values (66%) of 'pred_weight' (91,587 fewer NAs)
new variable 'pred_weight_source' (character) with 2 unique values and 0% NA
# 66% of obs contains only length
all_data2 |>
summarise(percobs = n() / nrow(all_data2) * 100, .by = pred_weight_source)summarise: now 2 rows and 2 columns, ungrouped
# A tibble: 2 × 2
pred_weight_source percobs
<chr> <dbl>
1 estimated 66.1
2 observed 33.9
# load BITS data
lw_data_bits <- read_csv(paste0(home, "/data/survey/historical_lw_data.csv")) Rows: 414145 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): sub_div, haul_id, species
dbl (8): year, lngt_cm, ind_wgt, lon, lat, quarter, month, day
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# combine bits with stomach data on lw
lw_data_stombits <- lw_data_bits |>
filter(species == "cod") |>
dplyr::select(year,lngt_cm,ind_wgt) |>
rename(Year = year,
pred_length = lngt_cm,
pred_weight = ind_wgt) |>
bind_rows( all_data2 |> filter(pred_weight_source == "observed") |> dplyr::select(Year, pred_weight, pred_length))filter: removed 112,502 rows (27%), 301,643 rows remaining
rename: renamed 3 variables (Year, pred_length, pred_weight)
filter: removed 91,779 rows (66%), 46,974 rows remaining
# fit yearly lm models
ebc_yearly_lw <- lw_data_stombits |>
group_by(Year) |>
group_modify( ~ tidy(lm(data = .x, log(pred_weight) ~ log(pred_length)))) |>
pivot_wider(id_cols=Year, names_from = term, values_from = estimate ) |>
rename(a = '(Intercept)',
b = 'log(pred_length)') |>
mutate(a = exp(a),
b = b) |>
ungroup()group_by: one grouping variable (Year)
pivot_wider: reorganized (term, estimate, std.error, statistic, p.value) into ((Intercept), log(pred_length)) [was 122x6, now 61x3]
rename: renamed 2 variables (a, b)
mutate (grouped): changed 61 values (100%) of 'a' (0 new NAs)
ungroup: no grouping variables remain
# plot a and b over time
ebc_yearly_lw |>
filter(a < 1) |>
pivot_longer(cols = c(a,b), names_to = "coeff", values_to = "value") |>
ggplot(aes(Year, value, color = coeff)) +
geom_point() +
facet_wrap(~coeff, scales = "free") +
stat_smooth(method = "gam", formula = y ~ s(x, k=3)) +
expand_limits(x=c(1960,2020)) +
labs("using a stat_smooth gam")filter: no rows removed
pivot_longer: reorganized (a, b) into (coeff, value) [was 61x3, now 122x3]
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.
# IF USING GAM to smooth out the large variation between years.
# # fit a gam for a and b to get smoothed predictions of a and b.
# agam <- gam(a ~ s(Year),
# data = ebc_yearly_lw,
# family = gaussian())
#
# bgam <- gam(b ~ s(Year),
# data = ebc_yearly_lw,
# family = gaussian())
# #predict
# apred <- predict_gam(agam, values = data.frame( Year = unique(ebc_yearly_lw$Year)))
# bpred <- predict_gam(bgam, values = data.frame( Year = unique(ebc_yearly_lw$Year)))
# combine to plot
# abpred <- bind_rows(apred |> rename(est=a) |> mutate(coeff= "a"), bpred |> rename(est=b) |> mutate(coeff= #"b"))
#
# abpred |>
# ggplot() +
# geom_line(aes(Year, est, color = coeff)) +
# geom_point(data = ebc_yearly_lw |> filter(a < 1) |>
# pivot_longer(cols = c(a,b), names_to = "coeff", values_to = "value"), aes(Year, value, color= coeff)) +
# geom_ribbon(aes(ymin = lower_ci, ymax = upper_ci, x=Year), color = NA, alpha = 0.3) +
# facet_wrap(~coeff, scales = "free") +
# expand_limits(x=c(1960,2020)) +
# labs("modelled gam smooth with a and b from linear model, unsmooth pattern is due to predictions on exact year values I think")
#
# join in a and b
# all_data3 <- all_data2 |>
# left_join(apred, join_by(Year)) |>
# left_join(bpred, join_by(Year))
#
# If not using gam to smooth variation in a a nd b between years:
all_data3 <- all_data2 |>
left_join(ebc_yearly_lw, join_by(Year))left_join: added 2 columns (a, b)
> rows only in all_data2 0
> rows only in ebc_yearly_lw ( 2)
> matched rows 138,753
> =========
> rows total 138,753
nrow(all_data3) - nrow(all_data2)[1] 0
# test the effect of using yearly a and bs insetad of mean general cods from fishbase
length_decade_ab <- expand.grid(length = 10:50, decade=seq(1960,2020, by=10)) |>
left_join(ebc_yearly_lw |> mutate(decade = round(Year/10) * 10) |> ungroup() |> summarise(meana = mean(a), .by= c(decade))) |>
left_join(ebc_yearly_lw |> mutate(decade = round(Year/10) * 10) |> ungroup() |> summarise(meanb = mean(b), .by= c(decade))) |>
mutate(tvar_ab = meana*length^meanb,
fb_ab = 0.00712575*length^3.098685) # mean cod fishbase a and bmutate: new variable 'decade' (double) with 7 unique values and 0% NA
ungroup: no grouping variables remain
summarise: now 7 rows and 2 columns, ungrouped
Joining with `by = join_by(decade)`
left_join: added one column (meana)
> rows only in expand.grid(length = 10.. 0
> rows only in summarise(ungroup(mutat.. ( 0)
> matched rows 287
> =====
> rows total 287
mutate: new variable 'decade' (double) with 7 unique values and 0% NA
ungroup: no grouping variables remain
summarise: now 7 rows and 2 columns, ungrouped
Joining with `by = join_by(decade)`
left_join: added one column (meanb)
> rows only in left_join(expand.grid(l.. 0
> rows only in summarise(ungroup(mutat.. ( 0)
> matched rows 287
> =====
> rows total 287
mutate: new variable 'tvar_ab' (double) with 287 unique values and 0% NA
new variable 'fb_ab' (double) with 41 unique values and 0% NA
# plot
length_decade_ab |>
ggplot(aes(length, tvar_ab, color = factor(decade))) +
geom_line() +
geom_line(aes(length, fb_ab), color = "black", linetype = "dashed") +
scale_color_viridis_d() +
labs(title = "black dashed line is weight from fishbase average a and b", y = "weight [g]")Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.
# estimate weight based on year a and b and on fishbase a and b for comparison in analysis.
all_data4 <- all_data3 |>
mutate(pred_weight_source = ifelse(pred_weight <= 0, "estimated", "observed"),
pred_weight = ifelse(pred_weight_source == "estimated", a*pred_length^b, pred_weight),
pred_weight_fb = ifelse(pred_weight_source == "estimated", 0.00712575*pred_length^3.098685, pred_weight)) # mean cod fishbase a and bmutate: changed 91,779 values (66%) of 'pred_weight' (0 new NAs)
new variable 'pred_weight_fb' (double) with 5,700 unique values and 0% NA
# Estimated weights using time varying a and b
all_data4 |>
arrange(desc(pred_weight_source)) |>
ggplot(aes(pred_length, pred_weight, color = pred_weight_source)) +
geom_point(shape = 4) +
labs(title = "Estimated weights using time varying a and b", y = "weight [g]")Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.
8. Calculate relative prey weights
# Calculate relative prey weights and remove values above 1
all_data5 <- all_data4 |>
rowwise() |>
mutate(rpw_sad = saduria / pred_weight,
rpw_spr = sprat / pred_weight,
rpw_her = herring / pred_weight,
rpw_other = other / pred_weight,
rpw_other_inverts = other_inverts / pred_weight,
rpw_other_chords = other_chords / pred_weight,
rpw_tot = sum(other, other_inverts, other_chords, saduria, sprat, herring) / pred_weight,
rpw_tot_fb = sum(other, other_inverts, other_chords, saduria, sprat, herring) / pred_weight_fb) |>
filter(between(rpw_tot, 0, 1 )) |> ungroup()mutate: new variable 'rpw_sad' (double) with 26,937 unique values and 0% NA
new variable 'rpw_spr' (double) with 18,953 unique values and 0% NA
new variable 'rpw_her' (double) with 9,975 unique values and 0% NA
new variable 'rpw_other' (double) with 1,162 unique values and 0% NA
new variable 'rpw_other_inverts' (double) with 49,220 unique values and 0% NA
new variable 'rpw_other_chords' (double) with 10,213 unique values and 0% NA
new variable 'rpw_tot' (double) with 81,418 unique values and 0% NA
new variable 'rpw_tot_fb' (double) with 65,312 unique values and 0% NA
filter: removed 249 rows (<1%), 138,504 rows remaining
ungroup: no grouping variables remain
all_data5 |> # log so we can see the data better
dplyr::select(rpw_sad, rpw_spr, rpw_her, rpw_tot, rpw_other, rpw_other_chords, rpw_other_inverts) |>
#dplyr::select(rpw_tot) |>
pivot_longer(everything()) |>
ggplot(aes(log(value))) +
geom_histogram(bins = 100) +
facet_wrap(~name, ncol = 2, scales = "free")pivot_longer: reorganized (rpw_sad, rpw_spr, rpw_her, rpw_tot, rpw_other, …)
into (name, value) [was 138504x7, now 969528x2]
Warning: Removed 746848 rows containing non-finite outside the scale range
(`stat_bin()`).
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.
Check the proportion of stomachs without these prey
all_data5 |>
pivot_longer(c("herring", "saduria", "sprat","other", "other_inverts", "other_chords")) |>
summarise(prop_empty = sum(value == 0)/n(),
prop_not_empty = sum(value != 0)/n(), .by = name)pivot_longer: reorganized (other_inverts, other_chords, other, herring, sprat, …) into (name, value) [was 138504x68, now 831024x64]
summarise: now 6 rows and 3 columns, ungrouped
# A tibble: 6 × 3
name prop_empty prop_not_empty
<chr> <dbl> <dbl>
1 herring 0.927 0.0725
2 saduria 0.792 0.208
3 sprat 0.861 0.139
4 other 0.991 0.00858
5 other_inverts 0.569 0.431
6 other_chords 0.922 0.0785
all_data5 |>
pivot_longer(c("herring", "saduria", "sprat","other", "other_inverts", "other_chords")) |>
summarise(prop_empty = sum(value == 0)/n(),
prop_not_empty = sum(value != 0)/n(), .by = c(name, Year)) |>
ggplot(aes(Year, prop_empty, color = name)) +
geom_line() +
facet_wrap(~name) +
guides(color = "none")pivot_longer: reorganized (other_inverts, other_chords, other, herring, sprat, …) into (name, value) [was 138504x68, now 831024x64]
summarise: now 354 rows and 4 columns, ungrouped
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.
Seems like fairly high proportions of cod without these key species in stomachs, but at least ~half have other prey in their stomachs
9. Remove points on land and outside Baltic
# Inspired by code from the map-plot function (in ...R/functions) but using terra instead of sf
# Specify map ranges
ymin = 52; ymax = 60.5; xmin = 13; xmax = 24
map_data <- rnaturalearth::ne_countries(
scale = "medium",
returnclass = "sv", continent = "europe")
# Crop the polygon for plotting and efficiency:
# st_bbox(map_data) # find the rough coordinates
swe_coast <- suppressWarnings(suppressMessages(
terra::crop(map_data,
c(xmin = xmin, ymin = ymin, xmax = xmax, ymax = ymax))))
# Change projection of the polygons and make SpatVector of the points
swe_coast_proj <- project(swe_coast, "epsg:32633") # utm_zone33
dat_vect <- vect(all_data5, geom=c("lon", "lat"))
# which points intesect with the land polygons
onland <- is.related(dat_vect, swe_coast, "intersects")
# seems correct
plot_map_fc +
geom_point(data = all_data5[onland,] |> add_utm_columns(ll_names = c("lon", "lat"), utm_crs = 32633), aes(X*1000, Y*1000), color="red") +
theme_sleek(base_size = 6)Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.
# remove the point on land (7500 stomachs)
all_data6 <- all_data5[!onland,]
#remove points outside the Baltic, i.e. the western most points in:
all_data6 |>
ggplot(aes(lon, lat)) +
geom_point() +
geom_vline(xintercept = 13) +
coord_sf()Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.
all_data6 <- all_data6 |>
filter( lon > 13)filter: removed 135 rows (<1%), 130,614 rows remaining
10. Last fixes, spatiotemporal distribution and save data
# add day of the year. When Day is missing (622 rows), I assume that day is 15, ie.e mid month, to not loose data for the sake of a not very important variable.
all_data7 <- all_data6 |>
mutate(day_of_year = if_else(!is.na(Day),
yday(as.Date(paste(Year, Month, Day, sep = "-"))),
yday(as.Date(paste(Year, Month, 15, sep = "-")))))mutate: new variable 'day_of_year' (double) with 277 unique values and 0% NA
# Fix some columns and select
df <- all_data7 |>
dplyr::select(!month) |> #remove errornous month
rename(ices_rect = ICESrectangle,
year = Year,
month = Month,
day = Day) |>
add_utm_columns(ll_names = c("lon", "lat"), utm_crs = 32633) |>
dplyr::select(pred_ID, Haul_ID, herring, saduria, sprat, other, other_inverts, other_chords, rpw_tot, rpw_tot_fb, rpw_sad, rpw_spr, rpw_her, rpw_other, rpw_other_inverts, rpw_other_chords, year, month, day, day_of_year, pred_weight, pred_weight_fb, pred_length, pred_weight_source, lat, lon, ices_rect, X, Y, data_source, Country, Depth)rename: renamed 4 variables (ices_rect, year, month, day)
# Add sample size per coordinate for plotting
df_plot <- df |>
group_by(year, Y, X) |>
mutate(sample_size = n(),
pos_id = paste(year, X, Y),
decade = round(year/10) * 10) |>
ungroup() |>
distinct(pos_id, .keep_all = TRUE)group_by: 3 grouping variables (year, Y, X)
mutate (grouped): new variable 'sample_size' (integer) with 299 unique values and 0% NA
new variable 'pos_id' (character) with 2,734 unique values and 0% NA
new variable 'decade' (double) with 7 unique values and 0% NA
ungroup: no grouping variables remain
distinct: removed 127,880 rows (98%), 2,734 rows remaining
plot_map_fc +
geom_point(data = df_plot, aes(X*1000, Y*1000, size = sample_size), alpha = 0.5) +
facet_wrap(~ decade, ncol = 4) +
theme(legend.position = "bottom",
axis.text.x = element_text(angle = 90)) +
scale_size(range = c(.01, 2), name = "# stomachs") +
geom_sf()Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).
ggsave(paste0(home, "/figures/supp/year_diet_map.pdf"), width = 15, height = 15, units = "cm")Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).
check for duplicates
# zero dupes
df |>
summarise(count = n(), .by = c(pred_ID, data_source, year, Country)) |>
filter(count > 1)summarise: now 130,614 rows and 5 columns, ungrouped
filter: removed all rows (100%)
# A tibble: 0 × 5
# ℹ 5 variables: pred_ID <chr>, data_source <chr>, year <dbl>, Country <chr>,
# count <int>
# Save data
# v2 as of 2025-06-26
write_csv(df, paste0(home, "/data/stomach/stomachs_v2.csv")) #